diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
commit | a175314c3e5827eb193872241446f2f8f5c9d33c (patch) | |
tree | cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/include | |
parent | Initial commit. (diff) | |
download | mariadb-10.5-upstream.tar.xz mariadb-10.5-upstream.zip |
Adding upstream version 1:10.5.12.upstream/1%10.5.12upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/include')
211 files changed, 74630 insertions, 0 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h new file mode 100644 index 00000000..7fae1ad1 --- /dev/null +++ b/storage/innobase/include/btr0btr.h @@ -0,0 +1,760 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.h +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0btr_h +#define btr0btr_h + +#include "dict0dict.h" +#include "data0data.h" +#include "rem0types.h" +#include "page0cur.h" +#include "btr0types.h" +#include "gis0type.h" + +#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level + (not really a hard limit). + Used in debug assertions + in btr_page_set_level and + btr_page_get_level */ + +/** Maximum record size which can be stored on a page, without using the +special big record storage structure */ +#define BTR_PAGE_MAX_REC_SIZE (srv_page_size / 2 - 200) + +/** @brief Maximum depth of a B-tree in InnoDB. 
+ +Note that this isn't a maximum as such; none of the tree operations +avoid producing trees bigger than this. It is instead a "max depth +that other code must work with", useful for e.g. fixed-size arrays +that must store some information about each level in a tree. In other +words: if a B-tree with bigger depth than this is encountered, it is +not acceptable for it to lead to mysterious memory corruption, but it +is acceptable for the program to die with a clear assert failure. */ +#define BTR_MAX_LEVELS 100 + +/** Latching modes for btr_cur_search_to_nth_level(). */ +enum btr_latch_mode { + /** Search a record on a leaf page and S-latch it. */ + BTR_SEARCH_LEAF = RW_S_LATCH, + /** (Prepare to) modify a record on a leaf page and X-latch it. */ + BTR_MODIFY_LEAF = RW_X_LATCH, + /** Obtain no latches. */ + BTR_NO_LATCHES = RW_NO_LATCH, + /** Start modifying the entire B-tree. */ + BTR_MODIFY_TREE = 33, + /** Continue modifying the entire B-tree. */ + BTR_CONT_MODIFY_TREE = 34, + /** Search the previous record. */ + BTR_SEARCH_PREV = 35, + /** Modify the previous record. */ + BTR_MODIFY_PREV = 36, + /** Start searching the entire B-tree. */ + BTR_SEARCH_TREE = 37, + /** Continue searching the entire B-tree. */ + BTR_CONT_SEARCH_TREE = 38, + + /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually + exclusive. */ + /** The search tuple will be inserted to the secondary index + at the searched position. When the leaf page is not in the + buffer pool, try to use the change buffer. */ + BTR_INSERT = 512, + + /** Try to delete mark a secondary index leaf page record at + the searched position using the change buffer when the page is + not in the buffer pool. */ + BTR_DELETE_MARK = 4096, + + /** Try to purge the record using the change buffer when the + secondary index leaf page is not in the buffer pool. */ + BTR_DELETE = 8192, + + /** The caller is already holding dict_index_t::lock S-latch. 
*/ + BTR_ALREADY_S_LATCHED = 16384, + /** Search and S-latch a leaf page, assuming that the + dict_index_t::lock S-latch is being held. */ + BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF + | BTR_ALREADY_S_LATCHED, + /** Search the entire index tree, assuming that the + dict_index_t::lock S-latch is being held. */ + BTR_SEARCH_TREE_ALREADY_S_LATCHED = BTR_SEARCH_TREE + | BTR_ALREADY_S_LATCHED, + /** Search and X-latch a leaf page, assuming that the + dict_index_t::lock S-latch is being held. */ + BTR_MODIFY_LEAF_ALREADY_S_LATCHED = BTR_MODIFY_LEAF + | BTR_ALREADY_S_LATCHED, + + /** Attempt to delete-mark a secondary index record. */ + BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK, + /** Attempt to delete-mark a secondary index record + while holding the dict_index_t::lock S-latch. */ + BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF + | BTR_ALREADY_S_LATCHED, + /** Attempt to purge a secondary index record. */ + BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE, + /** Attempt to purge a secondary index record + while holding the dict_index_t::lock S-latch. */ + BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF + | BTR_ALREADY_S_LATCHED, + + /** In the case of BTR_MODIFY_TREE, the caller specifies + the intention to delete record only. It is used to optimize + block->lock range.*/ + BTR_LATCH_FOR_DELETE = 65536, + + /** Attempt to purge a secondary index record in the tree. */ + BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE +}; + +/** This flag ORed to btr_latch_mode says that we do the search in query +optimization */ +#define BTR_ESTIMATE 1024U + +/** This flag ORed to BTR_INSERT says that we can ignore possible +UNIQUE definition on secondary indexes when we decide if we can use +the insert buffer to speed up inserts */ +#define BTR_IGNORE_SEC_UNIQUE 2048U + +/** In the case of BTR_MODIFY_TREE, the caller specifies the intention +to insert record only. 
It is used to optimize block->lock range.*/ +#define BTR_LATCH_FOR_INSERT 32768U + +/** This flag is for undo insert of rtree. For rtree, we need this flag +to find proper rec to undo insert.*/ +#define BTR_RTREE_UNDO_INS 131072U + +/** In the case of BTR_MODIFY_LEAF, the caller intends to allocate or +free the pages of externally stored fields. */ +#define BTR_MODIFY_EXTERNAL 262144U + +/** Try to delete mark the record at the searched position when the +record is in spatial index */ +#define BTR_RTREE_DELETE_MARK 524288U + +#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ + ((latch_mode) & ulint(~(BTR_INSERT \ + | BTR_DELETE_MARK \ + | BTR_RTREE_UNDO_INS \ + | BTR_RTREE_DELETE_MARK \ + | BTR_DELETE \ + | BTR_ESTIMATE \ + | BTR_IGNORE_SEC_UNIQUE \ + | BTR_ALREADY_S_LATCHED \ + | BTR_LATCH_FOR_INSERT \ + | BTR_LATCH_FOR_DELETE \ + | BTR_MODIFY_EXTERNAL))) + +#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \ + ((latch_mode) & ulint(~(BTR_LATCH_FOR_INSERT \ + | BTR_LATCH_FOR_DELETE \ + | BTR_MODIFY_EXTERNAL))) + +/** Report that an index page is corrupted. +@param[in] block buffer block +@param[in] index index tree */ +ATTRIBUTE_COLD ATTRIBUTE_NORETURN __attribute__((nonnull)) +void btr_corruption_report(const buf_block_t* block,const dict_index_t* index); + +/** Assert that a B-tree page is not corrupted. +@param block buffer block containing a B-tree page +@param index the B-tree index */ +#define btr_assert_not_corrupted(block, index) \ + if (!!page_is_comp(buf_block_get_frame(block)) \ + != index->table->not_redundant()) \ + btr_corruption_report(block, index) + +/**************************************************************//** +Gets the root node of a tree and sx-latches it for segment access. 
+@return root page, sx-latched */ +page_t* +btr_root_get( +/*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ + MY_ATTRIBUTE((warn_unused_result)); + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. +@return tree height (level of the root) */ +ulint +btr_height_get( +/*===========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Get an index page and declare its latching order level. +@param[in] index index tree +@param[in] page page number +@param[in] mode latch mode +@param[in] merge whether change buffer merge should be attempted +@param[in] file file name +@param[in] line line where called +@param[in,out] mtr mini-transaction +@return block */ +inline buf_block_t* btr_block_get_func(const dict_index_t& index, + uint32_t page, ulint mode, bool merge, + const char* file, unsigned line, + mtr_t* mtr) +{ + dberr_t err; + + if (buf_block_t* block = buf_page_get_gen( + page_id_t(index.table->space->id, page), + index.table->space->zip_size(), mode, NULL, BUF_GET, + file, line, mtr, &err, merge && !index.is_clust())) { + ut_ad(err == DB_SUCCESS); + if (mode != RW_NO_LATCH) { + buf_block_dbg_add_level(block, index.is_ibuf() + ? 
SYNC_IBUF_TREE_NODE + : SYNC_TREE_NODE); + } + return block; + } else { + ut_ad(err != DB_SUCCESS); + + if (err == DB_DECRYPTION_FAILED) { + if (index.table) { + index.table->file_unreadable = true; + } + } + + return NULL; + } +} + +/** Gets a buffer page and declares its latching order level. +@param index index tree +@param page page number +@param mode latch mode +@param merge whether change buffer merge should be attempted +@param mtr mini-transaction handle +@return the block descriptor */ +# define btr_block_get(index, page, mode, merge, mtr) \ + btr_block_get_func(index, page, mode, merge, __FILE__, __LINE__, mtr) +/**************************************************************//** +Gets the index id field of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ + MY_ATTRIBUTE((warn_unused_result)); +/** Read the B-tree or R-tree PAGE_LEVEL. +@param page B-tree or R-tree page +@return number of child page links to reach the leaf level +@retval 0 for leaf pages */ +inline uint16_t btr_page_get_level(const page_t *page) +{ + uint16_t level= mach_read_from_2(my_assume_aligned<2> + (PAGE_HEADER + PAGE_LEVEL + page)); + ut_ad(level <= BTR_MAX_NODE_LEVEL); + return level; +} MY_ATTRIBUTE((warn_unused_result)) + +/** Read FIL_PAGE_NEXT. +@param page buffer pool page +@return next page number */ +inline uint32_t btr_page_get_next(const page_t* page) +{ + return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT)); +} + +/** Read FIL_PAGE_PREV. +@param page buffer pool page +@return previous page number */ +inline uint32_t btr_page_get_prev(const page_t* page) +{ + return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV)); +} + +/**************************************************************//** +Releases the latch on a leaf page and bufferunfixes it. 
*/ +UNIV_INLINE +void +btr_leaf_page_release( +/*==================*/ + buf_block_t* block, /*!< in: buffer block */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/**************************************************************//** +Gets the child node file address in a node pointer. +NOTE: the offsets array must contain all offsets for the record since +we read the last field according to offsets and assume that it contains +the child page number. In other words offsets must have been retrieved +with rec_get_offsets(n_fields=ULINT_UNDEFINED). +@return child node address */ +UNIV_INLINE +uint32_t +btr_node_ptr_get_child_page_no( +/*===========================*/ + const rec_t* rec, /*!< in: node pointer record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Create the root node for a new index tree. +@param[in] type type of the index +@param[in,out] space tablespace where created +@param[in] index_id index id +@param[in] index index, or NULL to create a system table +@param[in,out] mtr mini-transaction +@return page number of the created root +@retval FIL_NULL if did not succeed */ +uint32_t +btr_create( + ulint type, + fil_space_t* space, + index_id_t index_id, + dict_index_t* index, + mtr_t* mtr); + +/** Free a persistent index tree if it exists. +@param[in] page_id root page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction */ +void +btr_free_if_exists( + const page_id_t page_id, + ulint zip_size, + index_id_t index_id, + mtr_t* mtr); + +/** Free an index tree in a temporary tablespace. +@param[in] page_id root page id */ +void btr_free(const page_id_t page_id); + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC. 
+@param[in,out] index clustered index +@return the last used AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc(dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC, +or fall back to MAX(auto_increment_column). +@param[in] table table containing an AUTO_INCREMENT column +@param[in] col_no index of the AUTO_INCREMENT column +@return the AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC. +@param[in,out] index clustered index +@param[in] autoinc the AUTO_INCREMENT value +@param[in] reset whether to reset the AUTO_INCREMENT + to a possibly smaller value than currently + exists in the page */ +void +btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false) + MY_ATTRIBUTE((nonnull)); + +/** Write instant ALTER TABLE metadata to a root page. +@param[in,out] root clustered index root page +@param[in] index clustered index with instant ALTER TABLE +@param[in,out] mtr mini-transaction */ +void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr); + +/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. +@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD __attribute__((nonnull)) +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr); + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. 
+NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. +@return inserted record */ +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((warn_unused_result)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +bool +btr_page_reorganize( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/** Decide if the page should be split at the convergence point of inserts +converging to the left. 
+@param[in] cursor insert position +@return the first record to be moved to the right half page +@retval NULL if no split is recommended */ +rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor); +/** Decide if the page should be split at the convergence point of inserts +converging to the right. +@param[in] cursor insert position +@param[out] split_rec if split recommended, the first record + on the right half page, or + NULL if the to-be-inserted record + should be first +@return whether split is recommended */ +bool +btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec); + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. + +@return inserted record */ +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. 
*/ +void +btr_insert_on_non_leaf_level_func( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_insert_on_non_leaf_level(f,i,l,t,m) \ + btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m) + +/** Set a child page pointer record as the predefined minimum record. +@tparam has_prev whether the page is supposed to have a left sibling +@param[in,out] rec leftmost record on a leftmost non-leaf page +@param[in,out] block buffer pool block +@param[in,out] mtr mini-transaction */ +template<bool has_prev= false> +inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block, + mtr_t *mtr) +{ + ut_ad(block.frame == page_align(rec)); + ut_ad(!page_is_leaf(block.frame)); + ut_ad(has_prev == page_has_prev(block.frame)); + + rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS; + + if (block.page.zip.data) + /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED + page. We are not modifying the compressed page frame at all. */ + *rec|= REC_INFO_MIN_REC_FLAG; + else + mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG); +} + +/** Seek to the parent page of a B-tree page. +@param[in,out] index b-tree +@param[in] block child page +@param[in,out] mtr mini-transaction +@param[out] cursor cursor pointing to the x-latched parent page */ +void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + btr_cur_t* cursor) + MY_ATTRIBUTE((nonnull)); +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. 
+@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the +brother reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to +the brothers, if they exist. +@return TRUE on success */ +ibool +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr); /*!< in: mtr */ +/**************************************************************//** +Gets the number of pages in a B-tree. 
+@return number of pages, or ULINT_UNDEFINED if the index is unavailable */ +ulint +btr_get_size( +/*=========*/ + const dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ + MY_ATTRIBUTE((warn_unused_result)); +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ + __attribute__((nonnull)); + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index tree */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + MY_ATTRIBUTE((warn_unused_result)); +/** Empty an index page (possibly the root page). @see btr_page_create(). 
+@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) + MY_ATTRIBUTE((nonnull(1, 3, 5))); +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). */ +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr); /*!< in: mtr */ + +/** Free an index page. +@param[in,out] index index tree +@param[in,out] block block to be freed +@param[in,out] mtr mini-transaction +@param[in] blob whether this is freeing a BLOB page */ +MY_ATTRIBUTE((nonnull)) +void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + bool blob = false); + +/**************************************************************//** +Gets the root node of a tree and x- or s-latches it. +@return root page, x- or s-latched */ +buf_block_t* +btr_root_block_get( +/*===============*/ + const dict_index_t* index, /*!< in: index tree */ + rw_lock_type_t mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr); /*!< in: mtr */ + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. 
+ +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +bool btr_page_reorganize_block( + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. */ +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ + MY_ATTRIBUTE((nonnull)); +/**************************************************************//** +Prints directories and other info of all nodes in the index. */ +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ + MY_ATTRIBUTE((nonnull)); +#endif /* UNIV_BTR_PRINT */ +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. +@return TRUE if ok */ +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ + MY_ATTRIBUTE((warn_unused_result)); +/**************************************************************//** +Checks the consistency of an index tree. +@return DB_SUCCESS if ok, error code if not */ +dberr_t +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or 0 */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Remove a page from the level list of pages. 
+@param[in] block page to remove +@param[in] index index tree +@param[in,out] mtr mini-transaction */ +void btr_level_list_remove(const buf_block_t& block, const dict_index_t& index, + mtr_t* mtr); + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. +@return father block */ +UNIV_INTERN +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + +#define BTR_N_LEAF_PAGES 1 +#define BTR_TOTAL_SIZE 2 + +#include "btr0btr.ic" + +/**************************************************************** +Global variable controlling if scrubbing should be performed */ +extern my_bool srv_immediate_scrub_data_uncompressed; +extern Atomic_counter<uint32_t> btr_validate_index_running; + +#endif diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic new file mode 100644 index 00000000..89826e8f --- /dev/null +++ b/storage/innobase/include/btr0btr.ic @@ -0,0 +1,149 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.ic +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "page0zip.h" + +/**************************************************************//** +Gets the index id field (PAGE_HEADER + PAGE_INDEX_ID, 8 bytes) of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ +{ + return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)); +} + +/** Set PAGE_LEVEL (2 bytes); if mtr->write() returns true and block->page.zip.data is set, the bytes are also copied there. +@param[in,out] block buffer block +@param[in] level page level; at most BTR_MAX_NODE_LEVEL (debug assertion) +@param[in,out] mtr mini-transaction */ +inline +void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr) +{ + ut_ad(level <= BTR_MAX_NODE_LEVEL); + constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL; + byte *b= my_assume_aligned<2>(&block->frame[field]); + if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<2>(&block->page.zip.data[field], b, 2); +} + +/** Set FIL_PAGE_NEXT (4 bytes); if mtr->write() returns true and block->page.zip.data is set, the bytes are also copied there. 
+@param[in,out] block buffer block +@param[in] next number of successor page +@param[in,out] mtr mini-transaction */ +inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr) +{ + constexpr uint16_t field= FIL_PAGE_NEXT; + byte *b= my_assume_aligned<4>(&block->frame[field]); + if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<4>(&block->page.zip.data[field], b, 4); +} + +/** Set FIL_PAGE_PREV (4 bytes); if mtr->write() returns true and block->page.zip.data is set, the bytes are also copied there.
+@param[in,out] block buffer block +@param[in] prev number of predecessor page +@param[in,out] mtr mini-transaction */ +inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr) +{ + constexpr uint16_t field= FIL_PAGE_PREV; + byte *b= my_assume_aligned<4>(&block->frame[field]); + if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<4>(&block->page.zip.data[field], b, 4); +} + +/**************************************************************//** +Gets the child page number stored in a node pointer. +NOTE: the offsets array must contain all offsets for the record since +we read the last field according to offsets and assume that it contains +the child page number. In other words offsets must have been retrieved +with rec_get_offsets(n_fields=ULINT_UNDEFINED). +@return child page number (read from the 4-byte last field) */ +UNIV_INLINE +uint32_t +btr_node_ptr_get_child_page_no( +/*===========================*/ + const rec_t* rec, /*!< in: node pointer record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + const byte* field; + ulint len; + + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + /* The child address is in the last field */ + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); + + ut_ad(len == 4); + + uint32_t page_no = mach_read_from_4(field); + ut_ad(page_no > 1); + + return(page_no); +} + +/**************************************************************//** +Releases the latches on a leaf page and buffer-unfixes it. The memo entry released is MTR_MEMO_PAGE_S_FIX, MTR_MEMO_PAGE_X_FIX or MTR_MEMO_BUF_FIX, chosen by latch_mode.
*/ +UNIV_INLINE +void +btr_leaf_page_release( +/*==================*/ + buf_block_t* block, /*!< in: buffer block */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, + BTR_MODIFY_LEAF or BTR_NO_LATCHES */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_MODIFY_LEAF + || latch_mode == BTR_NO_LATCHES); + + ut_ad(!mtr->memo_contains_flagged(block, MTR_MEMO_MODIFY)); + + mtr_memo_type_t mode; + switch (latch_mode) { + case BTR_SEARCH_LEAF: + mode = MTR_MEMO_PAGE_S_FIX; + break; + case BTR_MODIFY_LEAF: + mode = MTR_MEMO_PAGE_X_FIX; + break; + case BTR_NO_LATCHES: + mode = MTR_MEMO_BUF_FIX; + break; + default: + ut_a(0); + } + + mtr->memo_release(block, mode); +} diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h new file mode 100644 index 00000000..943836f8 --- /dev/null +++ b/storage/innobase/include/btr0bulk.h @@ -0,0 +1,371 @@ +/***************************************************************************** + +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0bulk.h +The B-tree bulk load + +Created 03/11/2014 Shaohua Wang +*************************************************************************/ + +#ifndef btr0bulk_h +#define btr0bulk_h + +#include "dict0dict.h" +#include "rem0types.h" +#include "page0cur.h" + +#include <vector> + +/** Innodb B-tree index fill factor for bulk load. */ +extern uint innobase_fill_factor; + +/* +The proper function call sequence of PageBulk is as below: +-- PageBulk::init +-- PageBulk::insert +-- PageBulk::finish +-- PageBulk::compress(COMPRESSED table only) +-- BtrBulk::pageSplit(COMPRESSED table only) +-- PageBulk::commit +*/ + +class PageBulk +{ +public: + /** Constructor + @param[in] index B-tree index + @param[in] trx_id transaction id + @param[in] page_no page number + @param[in] level page level */ + PageBulk( + dict_index_t* index, + trx_id_t trx_id, + uint32_t page_no, + ulint level) + : + m_heap(NULL), + m_index(index), + m_mtr(), + m_trx_id(trx_id), + m_block(NULL), + m_page(NULL), + m_page_zip(NULL), + m_cur_rec(NULL), + m_page_no(page_no), + m_level(level), + m_is_comp(dict_table_is_comp(index->table)), + m_heap_top(NULL), + m_rec_no(0), + m_free_space(0), + m_reserved_space(0), +#ifdef UNIV_DEBUG + m_total_data(0), +#endif /* UNIV_DEBUG */ + m_modify_clock(0), + m_err(DB_SUCCESS) + { + ut_ad(!dict_index_is_spatial(m_index)); + ut_ad(!m_index->table->is_temporary()); + } + + /** Destructor: frees m_heap */ + ~PageBulk() + { + mem_heap_free(m_heap); + } + + /** Initialize members and allocate page if needed and start mtr. + Note: must be called exactly once, right after the constructor.
+ @return error code */ + dberr_t init(); + + /** Insert a record in the page. + @param[in] rec record + @param[in] offsets record offsets */ + inline void insert(const rec_t* rec, rec_offs* offsets); +private: + /** Page format */ + enum format { REDUNDANT, DYNAMIC, COMPRESSED }; + /** Mark end of insertion to the page. Scan all records to set page + dirs, and set page header members. + @tparam format the page format */ + template<format> inline void finishPage(); + /** Insert a record in the page. + @tparam format the page format + @param[in,out] rec record + @param[in] offsets record offsets */ + template<format> inline void insertPage(rec_t* rec, rec_offs* offsets); + +public: + /** Mark end of insertion to the page. Scan all records to set page + dirs, and set page header members. */ + inline void finish(); + + /** @return whether finish() actually needs to do something */ + inline bool needs_finish() const; + + /** Commit mtr for a page + @param[in] success Flag whether all inserts succeed. */ + void commit(bool success); + + /** Compress the page if it belongs to a compressed table + @retval true compression succeeded or was not needed + @retval false compression failed. */ + bool compress(); + + /** Check whether the record needs to be stored externally. + @retval true if the record needs to be stored externally + @retval false otherwise */ + bool needExt(const dtuple_t* tuple, ulint rec_size); + + /** Store external record + @param[in] big_rec external record + @param[in] offsets record offsets + @return error code */ + dberr_t storeExt(const big_rec_t* big_rec, rec_offs* offsets); + + /** Get node pointer + @return node pointer */ + dtuple_t* getNodePtr(); + + /** Get split rec in the page. We split a page in half when compression + fails, and the split rec should be copied to the new page. + @return split rec */ + rec_t* getSplitRec(); + + /** Copy all records after split rec including itself. 
+ @param[in] split_rec split rec */ + void copyIn(rec_t* split_rec); + + /** Remove all records after split rec including itself.
+ @param[in] split_rec split rec */ + void copyOut(rec_t* split_rec); + + /** Set next page + @param[in] next_page_no next page no */ + inline void setNext(ulint next_page_no); + + /** Set previous page + @param[in] prev_page_no previous page no */ + inline void setPrev(ulint prev_page_no); + + /** Release block by committing mtr */ + inline void release(); + + /** Start mtr and latch block */ + inline dberr_t latch(); + + /** Check if required space is available in the page for the rec + to be inserted. We check fill factor & padding here. + @param[in] rec_size required length + @return true if space is available */ + inline bool isSpaceAvailable(ulint rec_size); + + /** Get page no */ + uint32_t getPageNo() const { return m_page_no; } + + /** Get page level */ + ulint getLevel() + { + return(m_level); + } + + /** Get record no */ + ulint getRecNo() + { + return(m_rec_no); + } + + /** Get page */ + page_t* getPage() + { + return(m_page); + } + + /** Get page zip */ + page_zip_des_t* getPageZip() + { + return(m_page_zip); + } + + dberr_t getError() /*!< @return the stored operation result m_err */ + { + return(m_err); + } + + void set_modified() { m_mtr.set_modified(*m_block); } + + /** Memory heap for internal allocation; freed by the destructor */ + mem_heap_t* m_heap; + +private: + /** The index B-tree */ + dict_index_t* m_index; + + /** The mini-transaction */ + mtr_t m_mtr; + + /** The transaction id */ + trx_id_t m_trx_id; + + /** The buffer block */ + buf_block_t* m_block; + + /** The page */ + page_t* m_page; + + /** The page zip descriptor */ + page_zip_des_t* m_page_zip; + + /** The current rec, just before the next insert rec */ + rec_t* m_cur_rec; + + /** The page no */ + uint32_t m_page_no; + + /** The page level in B-tree */ + ulint m_level; + + /** Flag: is page in compact format */ + const bool m_is_comp; + + /** The heap top in page for next insert */ + byte* m_heap_top; + + /** User record no */ + ulint m_rec_no; + + /** The free space left in the page */ + ulint 
m_free_space; + + /** The reserved space for fill factor */ + ulint
m_reserved_space; + + /** The padding space for compressed page; not set in the constructor's initializer list */ + ulint m_padding_space; + +#ifdef UNIV_DEBUG + /** Total data in the page */ + ulint m_total_data; +#endif /* UNIV_DEBUG */ + + /** The modify clock value of the buffer block + when the block is re-pinned */ + ib_uint64_t m_modify_clock; + + /** Operation result DB_SUCCESS or error code */ + dberr_t m_err; +}; + +typedef std::vector<PageBulk*, ut_allocator<PageBulk*> > + page_bulk_vector; + +class BtrBulk +{ +public: + /** Constructor + @param[in] index B-tree index; must not be a spatial index (debug assertion) + @param[in] trx transaction */ + BtrBulk( + dict_index_t* index, + const trx_t* trx) + : + m_index(index), + m_trx(trx) + { + ut_ad(!dict_index_is_spatial(index)); + } + + /** Insert a tuple at level 0 + @param[in] tuple tuple to insert. + @return error code */ + dberr_t insert(dtuple_t* tuple) + { + return(insert(tuple, 0)); + } + + /** Btree bulk load finish. We commit the last page in each level + and copy the last page in top level to the root page of the index + if no error occurs. + @param[in] err error code of the bulk load so far (DB_SUCCESS if successful until now) + @return error code */ + dberr_t finish(dberr_t err); + + /** Release all latches */ + void release(); + + /** Re-latch all latches */ + void latch(); + + table_name_t table_name() { return m_index->table->name; } + +private: + /** Insert a tuple to a page in a level + @param[in] tuple tuple to insert + @param[in] level B-tree level + @return error code */ + dberr_t insert(dtuple_t* tuple, ulint level); + + /** Split a page + @param[in] page_bulk page to split + @param[in] next_page_bulk next page + @return error code */ + dberr_t pageSplit(PageBulk* page_bulk, + PageBulk* next_page_bulk); + + /** Commit(finish) a page. We set next/prev page no, compress a page of + compressed table and split the page if compression fails, insert a node + pointer to father page if needed, and commit mini-transaction. 
+ @param[in] page_bulk page to commit + @param[in] next_page_bulk next page + @param[in] insert_father whether a node pointer needs to be inserted + @return error code */ + dberr_t pageCommit(PageBulk* page_bulk, + PageBulk* next_page_bulk, + bool insert_father); + + /** Abort a page when an error occurs, by calling commit(false) on it + @param[in] page_bulk page bulk object + Note: we should call pageAbort for a PageBulk object, which is not in + m_page_bulks after pageCommit, and we will commit or abort PageBulk + objects in function "finish". */ + void pageAbort(PageBulk* page_bulk) + { + page_bulk->commit(false); + } + + /** Log free check */ + inline void logFreeCheck(); + +private: + /** B-tree index */ + dict_index_t*const m_index; + + /** Transaction */ + const trx_t*const m_trx; + + /** Root page level; not set in the constructor's initializer list */ + ulint m_root_level; + + /** Page cursor vector for all levels */ + page_bulk_vector m_page_bulks; +}; + +#endif diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h new file mode 100644 index 00000000..7136d726 --- /dev/null +++ b/storage/innobase/include/btr0cur.h @@ -0,0 +1,1010 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.h +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0cur_h +#define btr0cur_h + +#include "dict0dict.h" +#include "page0cur.h" +#include "btr0types.h" +#include "rem0types.h" +#include "gis0type.h" +#include "my_base.h" + +/** Mode flags for btr_cur operations; these can be ORed */ +enum { + /** do no undo logging */ + BTR_NO_UNDO_LOG_FLAG = 1, + /** do no record lock checking */ + BTR_NO_LOCKING_FLAG = 2, + /** sys fields will be found in the update vector or inserted + entry */ + BTR_KEEP_SYS_FLAG = 4, + + /** no rollback */ + BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG, + + /** btr_cur_pessimistic_update() must keep cursor position + when moving columns to big_rec */ + BTR_KEEP_POS_FLAG = 8, + /** the caller is creating the index or wants to bypass the + index->info.online creation log */ + BTR_CREATE_FLAG = 16, + /** the caller of btr_cur_optimistic_update() or + btr_cur_update_in_place() will take care of + updating IBUF_BITMAP_FREE */ + BTR_KEEP_IBUF_BITMAP = 32 +}; + +/* btr_cur_latch_leaves() returns latched blocks and savepoints. */ +struct btr_latch_leaves_t { + /* left block, target block and right block */ + buf_block_t* blocks[3]; + ulint savepoints[3]; +}; + +#include "que0types.h" +#include "row0types.h" + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the page cursor component of a tree cursor. 
+@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the buffer block on which the tree cursor is positioned. +@return pointer to buffer block */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the record pointer of a tree cursor. +@return pointer to record */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +#else /* UNIV_DEBUG */ +# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur) +# define btr_cur_get_block(cursor) ((cursor)->page_cur.block) +# define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec) +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the index of a cursor. +@param cursor b-tree cursor +@return index */ +#define btr_cur_get_index(cursor) ((cursor)->index) +/*********************************************************//** +Positions a tree cursor at a given record. 
*/ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor);/*!< in: cursor */ + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool +btr_cur_instant_root_init(dict_index_t* index, const page_t* page) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + +/** Optimistically latches the leaf page or pages requested. +@param[in] block guessed buffer block +@param[in] modify_clock modify clock value +@param[in,out] latch_mode BTR_SEARCH_LEAF, ... +@param[in,out] cursor cursor +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@return true if success */ +bool +btr_cur_optimistic_latch_leaves( + buf_block_t* block, + ib_uint64_t modify_clock, + ulint* latch_mode, + btr_cur_t* cursor, + const char* file, + unsigned line, + mtr_t* mtr); + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +Note that if mode is PAGE_CUR_LE, which is used in inserts, then +cursor->up_match and cursor->low_match both will have sensible values. +If mode is PAGE_CUR_GE, then up_match will have a sensible value.
*/ +dberr_t +btr_cur_search_to_nth_level_func( + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the tree level of search */ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be PAGE_CUR_LE, + not PAGE_CUR_GE, as the latter may end up on + the previous page of the record! Inserts + should always be made using PAGE_CUR_LE to + search the position! */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with + at most one of BTR_INSERT, BTR_DELETE_MARK, + BTR_DELETE, or BTR_ESTIMATE; + cursor->left_block is used to store a pointer + to the left neighbor page, in the cases + BTR_SEARCH_PREV and BTR_MODIFY_PREV; + NOTE that if ahi_latch, we might not have a + cursor page latch, we assume that ahi_latch + protects the record! */ + btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ +#ifdef BTR_CUR_HASH_ADAPT + rw_lock_t* ahi_latch, + /*!< in: currently held btr_search_latch + (in RW_S_LATCH mode), or NULL */ +#endif /* BTR_CUR_HASH_ADAPT */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ib_uint64_t autoinc = 0); + /*!< in: PAGE_ROOT_AUTO_INC to be written + (0 if none) */ +#ifdef BTR_CUR_HASH_ADAPT +# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \ + btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,a,fi,li,mtr) +#else /* BTR_CUR_HASH_ADAPT */ +# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \ + btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,fi,li,mtr) +#endif /* BTR_CUR_HASH_ADAPT */ + +/*****************************************************************//** +Opens a cursor at either end of an index. 
+@return DB_SUCCESS or error code */ +dberr_t +btr_cur_open_at_index_side_func( +/*============================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_cur_t* cursor, /*!< in/out: cursor */ + ulint level, /*!< in: level to search for + (0=leaf) */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); + +#define btr_cur_open_at_index_side(f,i,l,c,lv,m) \ + btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m) + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. +@return true if the index is available and we have put the cursor, false +if the index is unavailable */ +bool +btr_cur_open_at_rnd_pos_func( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in/out: B-tree cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_cur_open_at_rnd_pos(i,l,c,m) \ + btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m) +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. 
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record on + success */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ + MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. 
+@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). 
*/ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ + dict_index_t* index, /*!< in: the index corresponding to cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr) +#else /* UNIV_DEBUG */ +# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr) +#endif /* UNIV_DEBUG */ + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). +@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. 
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. 
+@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. 
+@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. + Non-updated columns that are moved offpage will + be appended to this. */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be committed + before latching any further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. 
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + const dtuple_t* entry, /*!< in: dtuple for the deleting record */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return TRUE if compression occurred */ +ibool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if compression + occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/*******************************************************//** +Removes the record on which the tree cursor is positioned. It is assumed +that the mtr has an x-latch on the page where the cursor is positioned, +but no latch on the whole tree. 
+@return TRUE if success, i.e., the page did not become too empty */ +ibool +btr_cur_optimistic_delete_func( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + cursor stays valid: if deletion succeeds, + on function exit it points to the successor + of the deleted record */ +# ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +# endif /* UNIV_DEBUG */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +# ifdef UNIV_DEBUG +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, flags, mtr) +# else /* UNIV_DEBUG */ +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, mtr) +# endif /* UNIV_DEBUG */ +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. 
+@return TRUE if compression occurred */ +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/** Delete the node pointer in a parent page. +@param[in,out] parent cursor pointing to parent record +@param[in,out] mtr mini-transaction */ +void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); +/***********************************************************//** +Parses a redo log record of updating a record in-place. +@return end of log record or NULL */ +byte* +btr_cur_parse_update_in_place( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index); /*!< in: index corresponding to page */ +/** Arguments to btr_estimate_n_rows_in_range */ +struct btr_pos_t +{ + btr_pos_t(dtuple_t *arg_tuple, + page_cur_mode_t arg_mode, + page_id_t arg_page_id) + :tuple(arg_tuple), mode(arg_mode), page_id(arg_page_id) + {} + + dtuple_t* tuple; /* Range start or end. May be NULL */ + page_cur_mode_t mode; /* search mode for range */ + page_id_t page_id; /* Out: Page where we found the tuple */ +}; + +/** Estimates the number of rows in a given index range. 
+@param[in] index index +@param[in,out] range_start +@param[in,out] range_end +@return estimated number of rows */ +ha_rows +btr_estimate_n_rows_in_range( + dict_index_t* index, + btr_pos_t* range_start, + btr_pos_t* range_end); + + +/** Statistics for one field of an index. */ +struct index_field_stats_t +{ + ib_uint64_t n_diff_key_vals; + ib_uint64_t n_sample_sizes; + ib_uint64_t n_non_null_key_vals; + + index_field_stats_t(ib_uint64_t n_diff_key_vals= 0, + ib_uint64_t n_sample_sizes= 0, + ib_uint64_t n_non_null_key_vals= 0) + : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes), + n_non_null_key_vals(n_non_null_key_vals) + { + } +}; + +/** Estimates the number of different key values in a given index, for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +index->stat_n_sample_sizes[]. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and store the estimates in +array index->stat_n_non_null_key_vals. +@param[in] index index +@return stat vector if the index is available and we get the estimated numbers, +empty vector if the index is unavailable. */ +std::vector<index_field_stats_t> +btr_estimate_number_of_different_key_vals(dict_index_t* index); + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const rec_offs* offsets); + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. 
In purge only the owner of externally stored field is allowed +to free the field. */ +void +btr_cur_disown_inherited_fields( +/*============================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(2,3,4,5,6))); + +/** Operation code for btr_store_big_rec_extern_fields(). */ +enum blob_op { + /** Store off-page columns for a freshly inserted record */ + BTR_STORE_INSERT = 0, + /** Store off-page columns for an insert by update */ + BTR_STORE_INSERT_UPDATE, + /** Store off-page columns for an update */ + BTR_STORE_UPDATE, + /** Store off-page columns for a freshly inserted record by bulk */ + BTR_STORE_INSERT_BULK +}; + +/*******************************************************************//** +Determine if an operation on off-page columns is an update. +@return TRUE if op != BTR_STORE_INSERT */ +UNIV_INLINE +ibool +btr_blob_op_is_update( +/*==================*/ + enum blob_op op) /*!< in: operation */ + MY_ATTRIBUTE((warn_unused_result)); + +/*******************************************************************//** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node +file segment of the index tree. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + btr_pcur_t* pcur, /*!< in/out: a persistent cursor. if + btr_mtr is restarted, then this can + be repositioned. */ + rec_offs* offsets, /*!< in/out: rec_get_offsets() on + pcur. 
 the "external storage" flags + in offsets will correctly correspond + to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in/out: mtr containing the + latches to the clustered index. can be + committed and restarted. */ + enum blob_op op) /*!< in: operation code */ + MY_ATTRIBUTE((warn_unused_result)); + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data owns the externally stored field; +in a rollback we may have the additional condition that the field must +not be inherited. */ +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + buf_block_t* block, /*!< in/out: page of field_ref */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + bool rollback, /*!< in: performing rollback? */ + mtr_t* local_mtr) /*!< in: mtr containing the latch */ + MY_ATTRIBUTE((nonnull(1,2,5,8))); + +/** Copies the prefix of an externally stored field of a record. +The clustered index record must be protected by a lock or a page latch. 
+@param[out] buf the field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] local_len length of data, in bytes +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +ulint +btr_copy_externally_stored_field_prefix( + byte* buf, + ulint len, + ulint zip_size, + const byte* data, + ulint local_len); + +/** Copies an externally stored field of a record to mem heap. +The clustered index record must be protected by a lock or a page latch. +@param[out] len length of the whole field +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] local_len length of data +@param[in,out] heap mem heap +@return the whole field copied to heap */ +byte* +btr_copy_externally_stored_field( + ulint* len, + const byte* data, + ulint zip_size, + ulint local_len, + mem_heap_t* heap); + +/** Copies an externally stored field of a record to mem heap. +@param[in] rec record in a clustered index; must be +protected by a lock or a page latch +@param[in] offsets array returned by rec_get_offsets() +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] no field number +@param[out] len length of the field +@param[in,out] heap mem heap +@return the field copied to heap, or NULL if the field is incomplete */ +byte* +btr_rec_copy_externally_stored_field( + const rec_t* rec, + const rec_offs* offsets, + ulint zip_size, + ulint no, + ulint* len, + mem_heap_t* heap); + +/** Latches the leaf page or pages requested. +@param[in] block leaf page where the search converged +@param[in] latch_mode BTR_SEARCH_LEAF, ... 
+@param[in] cursor cursor +@param[in] mtr mini-transaction +@return blocks and savepoints which were actually latched. */ +btr_latch_leaves_t +btr_cur_latch_leaves( + buf_block_t* block, + ulint latch_mode, + btr_cur_t* cursor, + mtr_t* mtr); + +/*######################################################################*/ + +/** In the pessimistic delete, if the page data size drops below this +limit, merging it to a neighbor is tried */ +#define BTR_CUR_PAGE_COMPRESS_LIMIT(index) \ + ((srv_page_size * (ulint)((index)->merge_threshold)) / 100) + +/** A slot in the path array. We store here info on a search path down the +tree. Each slot contains data on a single level of the tree. */ +struct btr_path_t { + /* Assume a page like: + records: (inf, a, b, c, d, sup) + index of the record: 0, 1, 2, 3, 4, 5 + */ + + /** Index of the record where the page cursor stopped on this level + (index in alphabetical order). Value ULINT_UNDEFINED denotes array + end. In the above example, if the search stopped on record 'c', then + nth_rec will be 3. */ + ulint nth_rec; + + /** Number of the records on the page, not counting inf and sup. + In the above example n_recs will be 4. */ + ulint n_recs; + + /** Number of the page containing the record. */ + uint32_t page_no; + + /** Level of the page. If later we fetch the page under page_no + and it is on a different level, then we know that the tree has been + reorganized. 
 */ + ulint page_level; +}; + +#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */ + +/** Values for the flag documenting the used search method */ +enum btr_cur_method { + BTR_CUR_HASH = 1, /*!< successful shortcut using + the hash index */ + BTR_CUR_HASH_FAIL, /*!< failure using hash, success using + binary search: the misleading hash + reference is stored in the field + hash_node, and might be necessary to + update */ + BTR_CUR_BINARY, /*!< success using the binary search */ + BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to + the insert buffer */ + BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete + mark in the insert/delete buffer */ + BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in + the insert/delete buffer */ + BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */ +}; + +/** The tree cursor: the definition appears here only for the compiler +to know struct size! */ +struct btr_cur_t { + dict_index_t* index; /*!< index where positioned */ + page_cur_t page_cur; /*!< page cursor */ + purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */ + buf_block_t* left_block; /*!< this field is used to store + a pointer to the left neighbor + page, in the cases + BTR_SEARCH_PREV and + BTR_MODIFY_PREV */ + /*------------------------------*/ + que_thr_t* thr; /*!< this field is only used + when btr_cur_search_to_nth_level + is called for an index entry + insertion: the calling query + thread is passed here to be + used in the insert buffer */ + /*------------------------------*/ + /** The following fields are used in + btr_cur_search_to_nth_level to pass information: */ + /* @{ */ + enum btr_cur_method flag; /*!< Search method used */ + ulint tree_height; /*!< Tree height if the search is done + for a pessimistic insert or update + operation */ + ulint up_match; /*!< If the search mode was PAGE_CUR_LE, + the number of matched fields to the + first user record to the right of + the cursor record after + 
btr_cur_search_to_nth_level; + for the mode PAGE_CUR_GE, the matched + fields to the first user record AT THE + CURSOR or to the right of it; + NOTE that the up_match and low_match + values may exceed the correct values + for comparison to the adjacent user + record if that record is on a + different leaf page! (See the note in + row_ins_duplicate_error_in_clust.) */ + ulint up_bytes; /*!< number of matched bytes to the + right at the time cursor positioned; + only used internally in searches: not + defined after the search */ + ulint low_match; /*!< if search mode was PAGE_CUR_LE, + the number of matched fields to the + first user record AT THE CURSOR or + to the left of it after + btr_cur_search_to_nth_level; + NOT defined for PAGE_CUR_GE or any + other search modes; see also the NOTE + in up_match! */ + ulint low_bytes; /*!< number of matched bytes to the + left at the time cursor positioned; + only used internally in searches: not + defined after the search */ + ulint n_fields; /*!< prefix length used in a hash + search if hash_node != NULL */ + ulint n_bytes; /*!< hash prefix bytes if hash_node != + NULL */ + ulint fold; /*!< fold value used in the search if + flag is BTR_CUR_HASH */ + /* @} */ + btr_path_t* path_arr; /*!< in estimating the number of + rows in range, we store in this array + information of the path through + the tree */ + rtr_info_t* rtr_info; /*!< rtree search info */ + btr_cur_t():thr(NULL), rtr_info(NULL) {} + /* default values */ + /** Zero-initialize all fields */ + void init() + { + index = NULL; + memset(&page_cur, 0, sizeof page_cur); + purge_node = NULL; + left_block = NULL; + thr = NULL; + flag = btr_cur_method(0); + tree_height = 0; + up_match = 0; + up_bytes = 0; + low_match = 0; + low_bytes = 0; + n_fields = 0; + n_bytes = 0; + fold = 0; + path_arr = NULL; + rtr_info = NULL; + } +}; + +/** Modify the delete-mark flag of a record. 
+@tparam flag the value of the delete-mark flag +@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in,out] mtr mini-transaction */ +template<bool flag> +void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); + +/** If pessimistic delete fails because of lack of file space, there +is still a good chance of success a little later. Try this many +times. */ +#define BTR_CUR_RETRY_DELETE_N_TIMES 100 +/** If pessimistic delete fails because of lack of file space, there +is still a good chance of success a little later. Sleep this many +microseconds between retries. */ +#define BTR_CUR_RETRY_SLEEP_TIME 50000 + +/** The reference in a field for which data is stored on a different page. +The reference is at the end of the 'locally' stored part of the field. +'Locally' means storage in the index record. +We store locally a long enough prefix of each column so that we can determine +the ordering parts of each index record without looking into the externally +stored part. */ +/*-------------------------------------- @{ */ +#define BTR_EXTERN_SPACE_ID 0U /*!< space id where stored */ +#define BTR_EXTERN_PAGE_NO 4U /*!< page no where stored */ +#define BTR_EXTERN_OFFSET 8U /*!< offset of BLOB header + on that page */ +#define BTR_EXTERN_LEN 12U /*!< 8 bytes containing the + length of the externally + stored part of the BLOB. + The 2 highest bits are + reserved to the flags below. */ +/*-------------------------------------- @} */ +/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */ + +/** The most significant bit of BTR_EXTERN_LEN (i.e., the most +significant bit of the byte at smallest address) is set to 1 if this +field does not 'own' the externally stored field; only the owner field +is allowed to free the field in purge! 
*/ +#define BTR_EXTERN_OWNER_FLAG 128U +/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the +second most significant bit of the byte at smallest address) is 1 then +it means that the externally stored field was inherited from an +earlier version of the row. In rollback we are not allowed to free an +inherited external field. */ +#define BTR_EXTERN_INHERITED_FLAG 64U + +/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ +extern Atomic_counter<ulint> btr_cur_n_non_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_non_sea_old; +#ifdef BTR_CUR_HASH_ADAPT +/** Number of successful adaptive hash index lookups in +btr_cur_search_to_nth_level(). */ +extern ulint btr_cur_n_sea; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_sea_old; +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +extern uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +#include "btr0cur.ic" + +#endif diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic new file mode 100644 index 00000000..8a45b714 --- /dev/null +++ b/storage/innobase/include/btr0cur.ic @@ -0,0 +1,211 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.ic +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#ifdef UNIV_DEBUG +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\ +if (btr_cur_limit_optimistic_insert_debug > 1\ + && (NREC) >= btr_cur_limit_optimistic_insert_debug) {\ + CODE;\ +} +#else +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE) +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the page cursor component of a tree cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(&((btr_cur_t*) cursor)->page_cur); +} + +/*********************************************************//** +Returns the buffer block on which the tree cursor is positioned. +@return pointer to buffer block */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_cur_get_block(btr_cur_get_page_cur(cursor))); +} + +/*********************************************************//** +Returns the record pointer of a tree cursor. 
+@return pointer to record */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_cur_get_rec(btr_cur_get_page_cur(cursor))); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(buf_block_get_page_zip(btr_cur_get_block(cursor))); +} + +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_align(page_cur_get_rec(&(cursor->page_cur)))); +} + +/*********************************************************//** +Positions a tree cursor at a given record. */ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor) /*!< out: cursor */ +{ + ut_ad(page_align(rec) == block->frame); + + page_cur_position(rec, block, btr_cur_get_page_cur(cursor)); + + cursor->index = index; +} + +/*********************************************************************//** +Checks if compressing an index page where a btr cursor is placed makes +sense. 
+@return TRUE if compression is recommended */ +UNIV_INLINE +ibool +btr_cur_compress_recommendation( +/*============================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page; + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U, + return(FALSE)); + + if (!page_has_siblings(page) + || page_get_data_size(page) + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) { + + /* The page fillfactor has dropped below a predefined + minimum value OR the level in the B-tree contains just + one page: we recommend compression if this is not the + root page. */ + + return cursor->index->page + != btr_cur_get_block(cursor)->page.id().page_no(); + } + + return(FALSE); +} + +/*********************************************************************//** +Checks if the record on which the cursor is placed can be deleted without +making tree compression necessary (or, recommended). +@return TRUE if can be deleted without recommended compression */ +UNIV_INLINE +ibool +btr_cur_can_delete_without_compress( +/*================================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + if (!page_has_siblings(page) || page_get_n_recs(page) < 2 + || page_get_data_size(page) - rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) { + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. 
*/ + + return cursor->index->page + == btr_cur_get_block(cursor)->page.id().page_no(); + } + + return(TRUE); +} + +/*******************************************************************//** +Determine if an operation on off-page columns is an update. +@return TRUE if op != BTR_STORE_INSERT */ +UNIV_INLINE +ibool +btr_blob_op_is_update( +/*==================*/ + enum blob_op op) /*!< in: operation */ +{ + switch (op) { + case BTR_STORE_INSERT: + case BTR_STORE_INSERT_BULK: + return(FALSE); + case BTR_STORE_INSERT_UPDATE: + case BTR_STORE_UPDATE: + return(TRUE); + } + + ut_ad(0); + return(FALSE); +} diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h new file mode 100644 index 00000000..a9212db0 --- /dev/null +++ b/storage/innobase/include/btr0defragment.h @@ -0,0 +1,75 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef btr0defragment_h +#define btr0defragment_h + +#include "btr0pcur.h" + +/* Max number of pages to consider at once during defragmentation. 
 */ +#define BTR_DEFRAGMENT_MAX_N_PAGES 32 + +/** stats in btr_defragment */ +extern Atomic_counter<ulint> btr_defragment_compression_failures; +extern Atomic_counter<ulint> btr_defragment_failures; +extern Atomic_counter<ulint> btr_defragment_count; + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init(void); +/******************************************************************//** +Shutdown defragmentation. */ +void +btr_defragment_shutdown(); +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. */ +bool +btr_defragment_find_index( + dict_index_t* index); /*!< Index to find. */ +/******************************************************************//** +Add an index to btr_defragment_wq. Return a pointer to os_event if this +is a synchronized defragmentation. */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + dberr_t* err); /*!< out: error code */ +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_defragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table); /*!< Table to be removed. */ +/******************************************************************//** +Mark an index as removed from btr_defragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index); /*!< Index to be removed. 
*/ +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage.*/ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index); /*!< in: index */ + +/* Stop defragmentation.*/ +void btr_defragment_end(); +extern bool btr_defragment_active; +#endif diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h new file mode 100644 index 00000000..7facea7b --- /dev/null +++ b/storage/innobase/include/btr0pcur.h @@ -0,0 +1,546 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.h +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#ifndef btr0pcur_h +#define btr0pcur_h + +#include "dict0dict.h" +#include "btr0cur.h" +#include "buf0block_hint.h" +#include "btr0btr.h" +#include "gis0rtree.h" + +/* Relative positions for a stored cursor position */ +enum btr_pcur_pos_t { + BTR_PCUR_ON = 1, + BTR_PCUR_BEFORE = 2, + BTR_PCUR_AFTER = 3, +/* Note that if the tree is not empty, btr_pcur_store_position does not +use the following, but only uses the above three alternatives, where the +position is stored relative to a specific record: this makes implementation +of a scroll cursor easier */ + BTR_PCUR_BEFORE_FIRST_IN_TREE = 4, /* in an empty tree */ + BTR_PCUR_AFTER_LAST_IN_TREE = 5 /* in an empty tree */ +}; + +/**************************************************************//** +Allocates memory for a persistent cursor object and initializes the cursor. +@return own: persistent cursor */ +btr_pcur_t* +btr_pcur_create_for_mysql(void); +/*============================*/ + +/**************************************************************//** +Resets a persistent cursor object, freeing ::old_rec_buf if it is +allocated and resetting the other members to their initial values. */ +void +btr_pcur_reset( +/*===========*/ + btr_pcur_t* cursor);/*!< in, out: persistent cursor */ + +/**************************************************************//** +Frees the memory for a persistent cursor object. 
*/ +void +btr_pcur_free_for_mysql( +/*====================*/ + btr_pcur_t* cursor); /*!< in, own: persistent cursor */ +/**************************************************************//** +Copies the stored position of a pcur to another pcur. */ +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is + copied */ +/**************************************************************//** +Sets the old_rec_buf field to NULL. */ +UNIV_INLINE +void +btr_pcur_init( +/*==========*/ + btr_pcur_t* pcur); /*!< in: persistent cursor */ + +/** Free old_rec_buf. +@param[in] pcur Persistent cursor holding old_rec to be freed. */ +UNIV_INLINE +void +btr_pcur_free( + btr_pcur_t* pcur); + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +UNIV_INLINE +dberr_t +btr_pcur_open_low( +/*==============*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level in the btree */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... 
*/
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	const char*	file,	/*!< in: file name */
+	unsigned	line,	/*!< in: line where called */
+	ib_uint64_t	autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written
+				(0 if none) */
+	mtr_t*		mtr);	/*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m)				\
+	btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,0,m)
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor.
+@return error code of the underlying tree search */
+UNIV_INLINE
+dberr_t
+btr_pcur_open_with_no_init_func(
+/*============================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page of the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+				NOTE that if ahi_latch then we might not
+				acquire a cursor page latch, but assume
+				that the ahi_latch protects the record! */
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+#ifdef BTR_CUR_HASH_ADAPT
+	rw_lock_t*	ahi_latch,
+				/*!< in: adaptive hash index latch held
+				by the caller, or NULL if none */
+#endif /* BTR_CUR_HASH_ADAPT */
+	const char*	file,	/*!< in: file name */
+	unsigned	line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+#ifdef BTR_CUR_HASH_ADAPT
+# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m)		\
+	btr_pcur_open_with_no_init_func(ix,t,md,l,cur,ahi,__FILE__,__LINE__,m)
+#else /* BTR_CUR_HASH_ADAPT */
+# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m)		\
+	btr_pcur_open_with_no_init_func(ix,t,md,l,cur,__FILE__,__LINE__,m)
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. 
*/ +UNIV_INLINE +dberr_t +btr_pcur_open_at_index_side( +/*========================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_pcur_t* pcur, /*!< in/out: cursor */ + bool init_pcur, /*!< in: whether to initialize pcur */ + ulint level, /*!< in: level to search for + (0=leaf) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/**************************************************************//** +Gets the up_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_GE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_up_match( +/*==================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/**************************************************************//** +Gets the low_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_LE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_low_match( +/*===================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/**************************************************************//** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. */ +void +btr_pcur_open_on_user_rec_func( +/*===========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... 
*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \ + btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m) +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. +@return true if the index is available and we have put the cursor, false +if the index is unavailable */ +UNIV_INLINE +bool +btr_pcur_open_at_rnd_pos_func( +/*==========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in/out: B-tree pcur */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_open_at_rnd_pos(i,l,c,m) \ + btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m) +/**************************************************************//** +Frees the possible memory heap of a persistent cursor and sets the latch +mode of the persistent cursor to BTR_NO_LATCHES. +WARNING: this function does not release the latch on the page where the +cursor is currently positioned. The latch is acquired by the +"move to next/previous" family of functions. Since recursive shared locks +are not allowed, you must take care (if using the cursor in S-mode) to +manually release the latch by either calling +btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr) +or by committing the mini-transaction right after btr_pcur_close(). +A subsequent attempt to crawl the same page in the same mtr would cause +an assertion failure. 
*/
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**************************************************************//**
+Restores the stored position of a persistent cursor, buffer-fixing the page
+and obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... 
*/ + btr_pcur_t* cursor, /*!< in: detached persistent cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_restore_position(l,cur,mtr) \ + btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr) +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/**************************************************************//** +Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES, +that is, the cursor becomes detached. +Function btr_pcur_store_position should be used before calling this, +if restoration of cursor is wanted later. */ +UNIV_INLINE +void +btr_pcur_commit_specify_mtr( +/*========================*/ + btr_pcur_t* pcur, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr to commit */ + +/** Commits the mtr and sets the clustered index pcur and secondary index +pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached. +Function btr_pcur_store_position should be used for both cursor before +calling this, if restoration of cursor is wanted later. +@param[in] pcur persistent cursor +@param[in] sec_pcur secondary index persistent cursor +@param[in] mtr mtr to commit */ +UNIV_INLINE +void +btr_pcurs_commit_specify_mtr( + btr_pcur_t* pcur, + btr_pcur_t* sec_pcur, + mtr_t* mtr); + +/*********************************************************//** +Moves the persistent cursor to the next record in the tree. If no records are +left, the cursor stays 'after last in tree'. 
+@return TRUE if the cursor was not after last in tree */ +UNIV_INLINE +ibool +btr_pcur_move_to_next( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +ibool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the next user record in the tree. If no user +records are left, the cursor ends up 'after last in tree'. +@return TRUE if the cursor moved forward, ending on a user record */ +UNIV_INLINE +ibool +btr_pcur_move_to_next_user_rec( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. +Releases the latch on the current page, and bufferunfixes it. +Note that there must not be modifications on the current page, +as then the x-latch can be released only in mtr_commit. */ +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr); /*!< in: mtr */ +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the btr cursor component of a persistent cursor. 
+@return pointer to btr cursor component */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/*********************************************************//** +Returns the page cursor component of a persistent cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/*********************************************************//** +Returns the page of a persistent cursor. +@return pointer to the page */ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Returns the buffer block of a persistent cursor. +@return pointer to the block */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Returns the record of a persistent cursor. +@return pointer to the record */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +#else /* UNIV_DEBUG */ +# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur) +# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur) +# define btr_pcur_get_page(cursor) ((cursor)->btr_cur.page_cur.block->frame) +# define btr_pcur_get_block(cursor) ((cursor)->btr_cur.page_cur.block) +# define btr_pcur_get_rec(cursor) ((cursor)->btr_cur.page_cur.rec) +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Checks if the persistent cursor is on a user record. 
*/ +UNIV_INLINE +ibool +btr_pcur_is_on_user_rec( +/*====================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is after the last user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_on_page( +/*===========================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is before the first user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_on_page( +/*=============================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is before the first user record in +the index tree. */ +static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor); +/*********************************************************//** +Checks if the persistent cursor is after the last user record in +the index tree. */ +static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor); +/*********************************************************//** +Moves the persistent cursor to the next record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_next_on_page( +/*==========================*/ + btr_pcur_t* cursor);/*!< in/out: persistent cursor */ +/*********************************************************//** +Moves the persistent cursor to the previous record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_prev_on_page( +/*==========================*/ + btr_pcur_t* cursor);/*!< in/out: persistent cursor */ +/*********************************************************//** +Moves the persistent cursor to the infimum record on the same page. 
*/ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor); /*!< in/out: persistent cursor */ + +/** Position state of persistent B-tree cursor. */ +enum pcur_pos_t { + /** The persistent cursor is not positioned. */ + BTR_PCUR_NOT_POSITIONED = 0, + /** The persistent cursor was previously positioned. + TODO: currently, the state can be BTR_PCUR_IS_POSITIONED, + though it really should be BTR_PCUR_WAS_POSITIONED, + because we have no obligation to commit the cursor with + mtr; similarly latch_mode may be out of date. This can + lead to problems if btr_pcur is not used the right way; + all current code should be ok. */ + BTR_PCUR_WAS_POSITIONED, + /** The persistent cursor is positioned by optimistic get to the same + record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON. + It may need adjustment depending on previous/current search direction + and rel_pos. */ + BTR_PCUR_IS_POSITIONED_OPTIMISTIC, + /** The persistent cursor is positioned by index search. + Or optimistic get for rel_pos == BTR_PCUR_ON. */ + BTR_PCUR_IS_POSITIONED +}; + +/* The persistent B-tree cursor structure. This is used mainly for SQL +selects, updates, and deletes. */ + +struct btr_pcur_t{ + /** a B-tree cursor */ + btr_cur_t btr_cur; + /** see TODO note below! 
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES, + depending on the latching state of the page and tree where the cursor + is positioned; BTR_NO_LATCHES means that the cursor is not currently + positioned: + we say then that the cursor is detached; it can be restored to + attached if the old position was stored in old_rec */ + ulint latch_mode; + /** true if old_rec is stored */ + bool old_stored; + /** if cursor position is stored, contains an initial segment of the + latest record cursor was positioned either on, before or after */ + rec_t* old_rec; + /** btr_cur.index->n_core_fields when old_rec was copied */ + uint16 old_n_core_fields; + /** number of fields in old_rec */ + uint16 old_n_fields; + /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on + whether cursor was on, before, or after the old_rec record */ + enum btr_pcur_pos_t rel_pos; + /** buffer block when the position was stored */ + buf::Block_hint block_when_stored; + /** the modify clock value of the buffer block when the cursor position + was stored */ + ib_uint64_t modify_clock; + /** btr_pcur_store_position() and btr_pcur_restore_position() state. */ + enum pcur_pos_t pos_state; + /** PAGE_CUR_G, ... */ + page_cur_mode_t search_mode; + /** the transaction, if we know it; otherwise this field is not defined; + can ONLY BE USED in error prints in fatal assertion failures! */ + trx_t* trx_if_known; + /*-----------------------------*/ + /* NOTE that the following fields may possess dynamically allocated + memory which should be freed if not needed anymore! 
*/ + + /** NULL, or a dynamically allocated buffer for old_rec */ + byte* old_rec_buf; + /** old_rec_buf size if old_rec_buf is not NULL */ + ulint buf_size; + + btr_pcur_t() : + btr_cur(), latch_mode(RW_NO_LATCH), + old_stored(false), old_rec(NULL), + old_n_fields(0), rel_pos(btr_pcur_pos_t(0)), + block_when_stored(), + modify_clock(0), pos_state(BTR_PCUR_NOT_POSITIONED), + search_mode(PAGE_CUR_UNSUPP), trx_if_known(NULL), + old_rec_buf(NULL), buf_size(0) + { + btr_cur.init(); + } + + /** Return the index of this persistent cursor */ + dict_index_t* index() const { return(btr_cur.index); } +}; + +#include "btr0pcur.ic" + +#endif diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic new file mode 100644 index 00000000..d93da475 --- /dev/null +++ b/storage/innobase/include/btr0pcur.ic @@ -0,0 +1,645 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.ic +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + + +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor); + ut_ad(cursor->old_rec); + ut_ad(cursor->old_stored); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(cursor->rel_pos); +} + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the btr cursor component of a persistent cursor. +@return pointer to btr cursor component */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cur = &cursor->btr_cur; + return((btr_cur_t*) btr_cur); +} + +/*********************************************************//** +Returns the page cursor component of a persistent cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the page of a persistent cursor. 
+@return pointer to the page */ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the buffer block of a persistent cursor. +@return pointer to the block */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the record of a persistent cursor. +@return pointer to the record */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor))); +} +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Gets the up_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_GE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_up_match( +/*==================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cursor; + + ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED) + || (cursor->pos_state == BTR_PCUR_IS_POSITIONED)); + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + ut_ad(btr_cursor->up_match != ULINT_UNDEFINED); + + return(btr_cursor->up_match); +} + +/**************************************************************//** +Gets the low_match value for a pcur after a search. 
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED
+	      || cursor->pos_state == BTR_PCUR_WAS_POSITIONED);
+
+	const btr_cur_t* const	tree_cur = btr_pcur_get_btr_cur(cursor);
+
+	ut_ad(tree_cur->low_match != ULINT_UNDEFINED);
+
+	return tree_cur->low_match;
+}
+
+/** Tells whether the page cursor of a persistent cursor is past the last
+user record of its page. Debug builds assert that the cursor is positioned
+and not detached.
+@param[in]	cursor	persistent cursor
+@return result of page_cur_is_after_last() on the page cursor */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+	const btr_pcur_t*	cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	const page_cur_t*	page_cursor = btr_pcur_get_page_cur(cursor);
+
+	return page_cur_is_after_last(page_cursor);
+}
+
+/** Tells whether the page cursor of a persistent cursor is ahead of the
+first user record of its page. Debug builds assert that the cursor is
+positioned and not detached.
+@param[in]	cursor	persistent cursor
+@return result of page_cur_is_before_first() on the page cursor */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+	const btr_pcur_t*	cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	const page_cur_t*	page_cursor = btr_pcur_get_page_cur(cursor);
+
+	return page_cur_is_before_first(page_cursor);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. 
*/
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	/* Anything that is neither before the first nor after the last
+	user record of the page counts as a user record. */
+	return (btr_pcur_is_before_first_on_page(cursor)
+		|| btr_pcur_is_after_last_on_page(cursor))
+		? FALSE : TRUE;
+}
+
+/** Tells whether the cursor stands before the first user record of the
+whole index tree: its page has no predecessor and the page cursor is
+before the first record of that page.
+@param[in]	cursor	persistent cursor
+@return whether the cursor is before the first record in the tree */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	if (page_has_prev(btr_pcur_get_page(cursor))) {
+		return false;
+	}
+
+	return page_cur_is_before_first(btr_pcur_get_page_cur(cursor));
+}
+
+/** Tells whether the cursor stands after the last user record of the
+whole index tree: its page has no successor and the page cursor is
+after the last record of that page.
+@param[in]	cursor	persistent cursor
+@return whether the cursor is after the last record in the tree */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	if (page_has_next(btr_pcur_get_page(cursor))) {
+		return false;
+	}
+
+	return page_cur_is_after_last(btr_pcur_get_page_cur(cursor));
+}
+
+/** Advances the persistent cursor by one record within its current page
+and marks any stored position as stale.
+@param[in,out]	cursor	persistent cursor */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+	btr_pcur_t*	cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	page_cur_t*	page_cursor = btr_pcur_get_page_cur(cursor);
+
+	page_cur_move_to_next(page_cursor);
+	cursor->old_stored = false;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. 
*/
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	page_cur_t*	page_cursor = btr_pcur_get_page_cur(cursor);
+
+	page_cur_move_to_prev(page_cursor);
+	cursor->old_stored = false;
+}
+
+/** Steps the persistent cursor forward until it rests on a user record,
+crossing to following pages as needed. The stored position is marked
+stale. If the end of the tree is reached first, the cursor is left
+'after last in tree'.
+@param[in,out]	cursor	persistent cursor; NOTE that the function may
+			release the page latch
+@param[in,out]	mtr	mini-transaction
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+	btr_pcur_t*	cursor,
+	mtr_t*		mtr)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_stored = false;
+
+	do {
+		if (!btr_pcur_is_after_last_on_page(cursor)) {
+			btr_pcur_move_to_next_on_page(cursor);
+		} else if (btr_pcur_is_after_last_in_tree(cursor)) {
+			/* Nothing left to visit. */
+			return FALSE;
+		} else {
+			btr_pcur_move_to_next_page(cursor, mtr);
+		}
+	} while (!btr_pcur_is_on_user_rec(cursor));
+
+	return TRUE;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'. 
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	cursor->old_stored = false;
+
+	if (!btr_pcur_is_after_last_on_page(cursor)) {
+		/* Common case: the next record is on the same page. */
+		btr_pcur_move_to_next_on_page(cursor);
+	} else if (btr_pcur_is_after_last_in_tree(cursor)) {
+		return FALSE;
+	} else {
+		btr_pcur_move_to_next_page(cursor, mtr);
+	}
+
+	return TRUE;
+}
+
+/** Detaches a persistent cursor by committing its mini-transaction.
+The latch mode is set to BTR_NO_LATCHES before the commit and the
+position state to BTR_PCUR_WAS_POSITIONED after it. Call
+btr_pcur_store_position() beforehand if the cursor position is to be
+restored later.
+@param[in,out]	pcur	positioned persistent cursor
+@param[in,out]	mtr	mini-transaction to commit */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+	btr_pcur_t*	pcur,
+	mtr_t*		mtr)
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+	mtr_commit(mtr);
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursor before
+calling this, if restoration of cursor is wanted later. 
+@param[in] pcur persistent cursor +@param[in] sec_pcur secondary index persistent cursor +@param[in] mtr mtr to commit */ +UNIV_INLINE +void +btr_pcurs_commit_specify_mtr( + btr_pcur_t* pcur, + btr_pcur_t* sec_pcur, + mtr_t* mtr) +{ + ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(sec_pcur->pos_state == BTR_PCUR_IS_POSITIONED); + + pcur->latch_mode = BTR_NO_LATCHES; + sec_pcur->latch_mode = BTR_NO_LATCHES; + + mtr_commit(mtr); + + pcur->pos_state = BTR_PCUR_WAS_POSITIONED; + sec_pcur->pos_state = BTR_PCUR_WAS_POSITIONED; +} + +/**************************************************************//** +Sets the old_rec_buf field to NULL. */ +UNIV_INLINE +void +btr_pcur_init( +/*==========*/ + btr_pcur_t* pcur) /*!< in: persistent cursor */ +{ + pcur->old_stored = false; + pcur->old_rec_buf = NULL; + pcur->old_rec = NULL; + + pcur->btr_cur.rtr_info = NULL; +} + +/** Free old_rec_buf. +@param[in] pcur Persistent cursor holding old_rec to be freed. */ +UNIV_INLINE +void +btr_pcur_free( + btr_pcur_t* pcur) +{ + ut_free(pcur->old_rec_buf); +} + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +UNIV_INLINE +dberr_t +btr_pcur_open_low( +/*==============*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level in the btree */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... 
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + ib_uint64_t autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written + (0 if none) */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_cur_t* btr_cursor; + dberr_t err = DB_SUCCESS; + + /* Initialize the cursor */ + + btr_pcur_init(cursor); + + cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + cursor->search_mode = mode; + + /* Search with the tree cursor */ + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + ut_ad(!dict_index_is_spatial(index)); + + err = btr_cur_search_to_nth_level_func( + index, level, tuple, mode, latch_mode, btr_cursor, +#ifdef BTR_CUR_HASH_ADAPT + NULL, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, mtr, autoinc); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ib::warn() << "btr_pcur_open_low" + << " level: " << level + << " called from file: " + << file << " line: " << line + << " table: " << index->table->name + << " index: " << index->name + << " error: " << err; + } + + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + cursor->trx_if_known = NULL; + + return(err); +} + +/**************************************************************//** +Opens an persistent cursor to an index tree without initializing the +cursor. */ +UNIV_INLINE +dberr_t +btr_pcur_open_with_no_init_func( +/*============================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page of the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...; + NOTE that if ahi_latch then we might not + acquire a cursor page latch, but assume + that the ahi_latch protects the record! 
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ +#ifdef BTR_CUR_HASH_ADAPT + rw_lock_t* ahi_latch, + /*!< in: adaptive hash index latch held + by the caller, or NULL if none */ +#endif /* BTR_CUR_HASH_ADAPT */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_cur_t* btr_cursor; + dberr_t err = DB_SUCCESS; + /* btr_pcur_init() is not called here: of the fields it resets, + only old_stored is cleared (further below) */ + cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode); + cursor->search_mode = mode; + + /* Search with the tree cursor at level 0 */ + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + err = btr_cur_search_to_nth_level_func( + index, 0, tuple, mode, latch_mode, btr_cursor, +#ifdef BTR_CUR_HASH_ADAPT + ahi_latch, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, mtr); + + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + cursor->old_stored = false; + + cursor->trx_if_known = NULL; + return err; +} + +/*****************************************************************//** +Opens a persistent cursor at either end of an index. +@return the error code returned by btr_cur_open_at_index_side() */ +UNIV_INLINE +dberr_t +btr_pcur_open_at_index_side( +/*========================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_pcur_t* pcur, /*!< in/out: cursor */ + bool init_pcur, /*!< in: whether to initialize pcur */ + ulint level, /*!< in: level to search for + (0=leaf) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dberr_t err = DB_SUCCESS; + + pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + pcur->search_mode = from_left ? 
PAGE_CUR_G : PAGE_CUR_L; + + if (init_pcur) { + btr_pcur_init(pcur); + } + + err = btr_cur_open_at_index_side( + from_left, index, latch_mode, + btr_pcur_get_btr_cur(pcur), level, mtr); + pcur->pos_state = BTR_PCUR_IS_POSITIONED; + + pcur->old_stored = false; + + pcur->trx_if_known = NULL; + + return (err); +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. +@return true if the index is available and the cursor has been positioned, +false if the index is unavailable */ +UNIV_INLINE +bool +btr_pcur_open_at_rnd_pos_func( +/*==========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in/out: B-tree pcur */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + /* Initialize the cursor. The latch mode is stored exactly as passed, + without the BTR_LATCH_MODE_WITHOUT_FLAGS() masking that + btr_pcur_open_at_index_side() applies. */ + + cursor->latch_mode = latch_mode; + cursor->search_mode = PAGE_CUR_G; + + btr_pcur_init(cursor); + + bool available; + + available = btr_cur_open_at_rnd_pos_func(index, latch_mode, + btr_pcur_get_btr_cur(cursor), + file, line, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; /* set whether or not the index was available */ + cursor->old_stored = false; + + cursor->trx_if_known = NULL; + + return(available); +} + +/**************************************************************//** +Frees the possible memory heap of a persistent cursor and sets the latch +mode of the persistent cursor to BTR_NO_LATCHES. +WARNING: this function does not release the latch on the page where the +cursor is currently positioned. The latch is acquired by the +"move to next/previous" family of functions. Since recursive shared locks +are not allowed, you must take care (if using the cursor in S-mode) to +manually release the latch by either calling +btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr) +or by committing the mini-transaction right after btr_pcur_close(). 
+A subsequent attempt to crawl the same page in the same mtr would cause +an assertion failure. */ +UNIV_INLINE +void +btr_pcur_close( +/*===========*/ + btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_free(cursor->old_rec_buf); /* the pointer is reset to NULL below */ + /* Clean up rtr_info if one is attached */ + if (cursor->btr_cur.rtr_info) { + rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true); + cursor->btr_cur.rtr_info = NULL; + } + + cursor->old_rec = NULL; + cursor->old_rec_buf = NULL; + cursor->btr_cur.page_cur.rec = NULL; + cursor->btr_cur.page_cur.block = NULL; + /* NOTE(review): old_rec was already set to NULL a few lines above; + the assignment below is redundant. */ + cursor->old_rec = NULL; + cursor->old_stored = false; + + cursor->latch_mode = BTR_NO_LATCHES; + cursor->pos_state = BTR_PCUR_NOT_POSITIONED; + + cursor->trx_if_known = NULL; +} + +/*********************************************************//** +Moves the persistent cursor to the infimum record on the same page. +Also clears old_stored. The cursor is expected to have a latch mode other +than BTR_NO_LATCHES; this is asserted with ut_ad(). */ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_before_first(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = false; +} diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h new file mode 100644 index 00000000..1e6b667c --- /dev/null +++ b/storage/innobase/include/btr0sea.h @@ -0,0 +1,392 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.h +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0sea_h +#define btr0sea_h + +#include "dict0dict.h" +#ifdef BTR_CUR_HASH_ADAPT +#include "ha0ha.h" +#include "sync0sync.h" + +#define btr_search_sys_create() btr_search_sys.create() +#define btr_search_sys_free() btr_search_sys.free() + +/** Disable the adaptive hash search system and empty the index. */ +void btr_search_disable(); + +/** Enable the adaptive hash search system. +@param resize whether buf_pool_t::resize() is the caller */ +void btr_search_enable(bool resize= false); + +/*********************************************************************//** +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor);/*!< in: cursor which was just positioned */ + +/** Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@param[in,out] index index +@param[in,out] info index search info +@param[in] tuple logical record +@param[in] mode PAGE_CUR_L, .... +@param[in] latch_mode BTR_SEARCH_LEAF, ...; + NOTE that only if has_search_latch is 0, we will + have a latch set on the cursor page, otherwise + we assume the caller uses his search latch + to protect the record! 
+@param[out] cursor tree cursor +@param[in] ahi_latch the adaptive hash index latch being held, + or NULL +@param[in] mtr mini transaction +@return whether the search succeeded */ +bool +btr_search_guess_on_hash( + dict_index_t* index, + btr_search_t* info, + const dtuple_t* tuple, + ulint mode, + ulint latch_mode, + btr_cur_t* cursor, + rw_lock_t* ahi_latch, + mtr_t* mtr); + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. +If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block); + +/** Drop any adaptive hash index entries that point to an index page. +@param[in,out] block block containing index page, s- or x-latched, or an + index page for which we know that + block->buf_fix_count == 0 or it is an index page which + has already been removed from the buf_pool.page_hash + i.e.: it is in state BUF_BLOCK_REMOVE_HASH */ +void btr_search_drop_page_hash_index(buf_block_t* block); + +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. +@param[in] page_id page id */ +void btr_search_drop_page_hash_when_freed(const page_id_t page_id); + +/** Updates the page hash index when a single record is inserted on a page. +@param[in] cursor cursor which was positioned to the place to insert + using btr_cur_search_, and the new record has been + inserted next to the cursor. +@param[in] ahi_latch the adaptive hash index latch */ +void +btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch); + +/** Updates the page hash index when a single record is inserted on a page. 
+@param[in,out] cursor cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ +void +btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch); + +/** Updates the page hash index when a single record is deleted from a page. +@param[in] cursor cursor which was positioned on the record to delete + using btr_cur_search_, the record is not yet deleted.*/ +void btr_search_update_hash_on_delete(btr_cur_t* cursor); + +/** Validates the search system. +@return true if ok */ +bool btr_search_validate(); + +/** Lock all search latches in exclusive mode. */ +static inline void btr_search_x_lock_all(); + +/** Unlock all search latches from exclusive mode. */ +static inline void btr_search_x_unlock_all(); + +/** Lock all search latches in shared mode. */ +static inline void btr_search_s_lock_all(); + +#ifdef UNIV_DEBUG +/** Check if thread owns all the search latches. +@param[in] mode lock mode check +@retval true if owns all of them +@retval false if does not own some of them */ +static inline bool btr_search_own_all(ulint mode); + +/** Check if thread owns any of the search latches. +@param[in] mode lock mode check +@retval true if owns any of them +@retval false if owns no search latch */ +static inline bool btr_search_own_any(ulint mode); + +/** @return whether this thread holds any of the search latches */ +static inline bool btr_search_own_any(); +#endif /* UNIV_DEBUG */ + +/** Unlock all search latches from shared mode. 
*/ +static inline void btr_search_s_unlock_all(); + +#else /* BTR_CUR_HASH_ADAPT */ +# define btr_search_sys_create() +# define btr_search_sys_free() +# define btr_search_drop_page_hash_index(block) +# define btr_search_s_lock_all(index) +# define btr_search_s_unlock_all(index) +# define btr_search_info_update(index, cursor) +# define btr_search_move_or_delete_hash_entries(new_block, block) +# define btr_search_update_hash_on_insert(cursor, ahi_latch) +# define btr_search_update_hash_on_delete(cursor) +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef BTR_CUR_ADAPT +/** Create and initialize search info. +@param[in,out] heap heap where created +@return own: search info struct */ +static inline btr_search_t* btr_search_info_create(mem_heap_t* heap) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** @return the search info of an index */ +static inline btr_search_t* btr_search_get_info(dict_index_t* index) +{ + return(index->search_info); +} +#endif /* BTR_CUR_ADAPT */ + +/** The search info struct in an index */ +struct btr_search_t{ + /* @{ The following fields are not protected by any latch. + Unfortunately, this means that they must be aligned to + the machine word, i.e., they cannot be turned into bit-fields. */ + buf_block_t* root_guess;/*!< the root page frame when it was last time + fetched, or NULL */ +#ifdef BTR_CUR_HASH_ADAPT + ulint hash_analysis; /*!< when this exceeds + BTR_SEARCH_HASH_ANALYSIS, the hash + analysis starts; this is reset if no + success noticed */ + ibool last_hash_succ; /*!< TRUE if the last search would have + succeeded, or did succeed, using the hash + index; NOTE that the value here is not exact: + it is not calculated for every search, and the + calculation itself is not always accurate! */ + ulint n_hash_potential; + /*!< number of consecutive searches + which would have succeeded, or did succeed, + using the hash index; + the range is 0 .. 
BTR_SEARCH_BUILD_LIMIT + 5 */ + /* @} */ + ulint ref_count; /*!< Number of blocks in this index tree + that have search index built + i.e. block->index points to this index. + Protected by search latch except + during initialization in + btr_search_info_create(). */ + + /*---------------------- @{ */ + uint16_t n_fields; /*!< recommended prefix length for hash search: + number of full fields */ + uint16_t n_bytes; /*!< recommended prefix: number of bytes in + an incomplete field + @see BTR_PAGE_MAX_REC_SIZE */ + bool left_side; /*!< true or false, depending on whether + the leftmost record of several records with + the same prefix should be indexed in the + hash index */ + /*---------------------- @} */ +#ifdef UNIV_SEARCH_PERF_STAT + ulint n_hash_succ; /*!< number of successful hash searches thus + far */ + ulint n_hash_fail; /*!< number of failed hash searches */ + ulint n_patt_succ; /*!< number of successful pattern searches thus + far */ + ulint n_searches; /*!< number of searches */ +#endif /* UNIV_SEARCH_PERF_STAT */ +#endif /* BTR_CUR_HASH_ADAPT */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */ +/** value of btr_search_t::magic_n, used in assertions */ +# define BTR_SEARCH_MAGIC_N 1112765 +#endif /* UNIV_DEBUG */ +}; + +#ifdef BTR_CUR_HASH_ADAPT +/** The hash index system */ +struct btr_search_sys_t +{ + /** Partition of the hash table */ + struct partition + { + /** latches protecting hash_table */ + rw_lock_t latch; + /** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */ + hash_table_t table; + /** memory heap for table */ + mem_heap_t *heap; + /** Padding whose length is derived from CPU_LEVEL1_DCACHE_LINESIZE + and the sizes of the members above. + NOTE(review): the last term subtracted is sizeof(mem_heap_t), the size + of the heap object, although the member is a pointer (mem_heap_t *heap); + confirm whether the pointer size was intended. */ + char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof(rw_lock_t) - + sizeof(hash_table_t) - sizeof(mem_heap_t)) & + (CPU_LEVEL1_DCACHE_LINESIZE - 1)]; + /** Zero-fill this partition and create its latch */ + void init() + { + memset((void*) this, 0, sizeof *this); + rw_lock_create(btr_search_latch_key, &latch, SYNC_SEARCH_SYS); + } + /** Create the hash table with hash_size cells and create the memory heap */ + void alloc(ulint hash_size) + { + table.create(hash_size); + heap= 
mem_heap_create_typed(std::min<ulong>(4096, + MEM_MAX_ALLOC_IN_BUF / 2 + - MEM_BLOCK_HEADER_SIZE + - MEM_SPACE_NEEDED(0)), + MEM_HEAP_FOR_BTR_SEARCH); + } + + void clear() + { + mem_heap_free(heap); + heap= nullptr; + ut_free(table.array); + } + + void free() + { + rw_lock_free(&latch); + if (heap) + clear(); + } + }; + + /** Partitions of the adaptive hash index */ + partition *parts; + + /** Get an adaptive hash index partition */ + partition *get_part(index_id_t id, ulint space_id) const + { + return parts + ut_fold_ulint_pair(ulint(id), space_id) % btr_ahi_parts; + } + + /** Get an adaptive hash index partition */ + partition *get_part(const dict_index_t &index) const + { + ut_ad(!index.table->space || + index.table->space->id == index.table->space_id); + return get_part(ulint(index.id), index.table->space_id); + } + + /** Get the search latch for the adaptive hash index partition */ + rw_lock_t *get_latch(const dict_index_t &index) const + { return &get_part(index)->latch; } + + /** Create and initialize at startup */ + void create() + { + parts= static_cast<partition*>(ut_malloc(btr_ahi_parts * sizeof *parts, + mem_key_ahi)); + for (ulong i= 0; i < btr_ahi_parts; ++i) + parts[i].init(); + if (btr_search_enabled) + btr_search_enable(); + } + + void alloc(ulint hash_size) + { + hash_size/= btr_ahi_parts; + for (ulong i= 0; i < btr_ahi_parts; ++i) + parts[i].alloc(hash_size); + } + + /** Clear when disabling the adaptive hash index */ + void clear() { for (ulong i= 0; i < btr_ahi_parts; ++i) parts[i].clear(); } + + /** Free at shutdown */ + void free() + { + if (parts) + { + for (ulong i= 0; i < btr_ahi_parts; ++i) + parts[i].free(); + ut_free(parts); + parts= nullptr; + } + } +}; + +/** The adaptive hash index */ +extern btr_search_sys_t btr_search_sys; + +/** @return number of leaf pages pointed to by the adaptive hash index */ +inline ulint dict_index_t::n_ahi_pages() const +{ + if (!btr_search_enabled) + return 0; + rw_lock_t *latch = 
&btr_search_sys.get_part(*this)->latch; + rw_lock_s_lock(latch); + ulint ref_count= search_info->ref_count; + rw_lock_s_unlock(latch); + return ref_count; +} + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +extern ulint btr_search_n_succ; +/** Number of failed adaptive hash index lookups */ +extern ulint btr_search_n_hash_fail; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/** After change in n_fields or n_bytes in info, this many rounds are waited +before starting the hash analysis again: this is to save CPU time when there +is no hope in building a hash index. */ +#define BTR_SEARCH_HASH_ANALYSIS 17 + +/** Limit of consecutive searches for trying a search shortcut on the search +pattern */ +#define BTR_SEARCH_ON_PATTERN_LIMIT 3 + +/** Limit of consecutive searches for trying a search shortcut using +the hash index */ +#define BTR_SEARCH_ON_HASH_LIMIT 3 + +/** We do this many searches before trying to keep the search latch +over calls from MySQL. If we notice someone waiting for the latch, we +again set this much timeout. This is to reduce contention. */ +#define BTR_SEA_TIMEOUT 10000 +#endif /* BTR_CUR_HASH_ADAPT */ + +#include "btr0sea.ic" + +#endif diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic new file mode 100644 index 00000000..40eb5d86 --- /dev/null +++ b/storage/innobase/include/btr0sea.ic @@ -0,0 +1,160 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.ic +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "dict0mem.h" +#include "btr0cur.h" +#include "buf0buf.h" + +/** Create and initialize search info. +@param[in,out] heap heap where created +@return own: search info struct */ +static inline btr_search_t* btr_search_info_create(mem_heap_t* heap) +{ + btr_search_t* info = static_cast<btr_search_t*>( + mem_heap_zalloc(heap, sizeof(btr_search_t))); + ut_d(info->magic_n = BTR_SEARCH_MAGIC_N); +#ifdef BTR_CUR_HASH_ADAPT + info->n_fields = 1; + info->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ + return(info); +} + +#ifdef BTR_CUR_HASH_ADAPT +/** Updates the search info. +@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ +void +btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor); + +/*********************************************************************//** +Updates the search info. 
*/ +static inline +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor) /*!< in: cursor which was just positioned */ +{ + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); + + if (dict_index_is_spatial(index) || !btr_search_enabled) { + return; + } + + btr_search_t* info; + info = btr_search_get_info(index); + + info->hash_analysis++; + + if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) { + + /* Do nothing */ + + return; + + } + + ut_ad(cursor->flag != BTR_CUR_HASH); + + btr_search_info_update_slow(info, cursor); +} + +/** Lock all search latches in exclusive mode. */ +static inline void btr_search_x_lock_all() +{ + for (ulint i = 0; i < btr_ahi_parts; ++i) { + rw_lock_x_lock(&btr_search_sys.parts[i].latch); + } +} + +/** Unlock all search latches from exclusive mode. */ +static inline void btr_search_x_unlock_all() +{ + for (ulint i = 0; i < btr_ahi_parts; ++i) { + rw_lock_x_unlock(&btr_search_sys.parts[i].latch); + } +} + +/** Lock all search latches in shared mode. */ +static inline void btr_search_s_lock_all() +{ + for (ulint i = 0; i < btr_ahi_parts; ++i) { + rw_lock_s_lock(&btr_search_sys.parts[i].latch); + } +} + +/** Unlock all search latches from shared mode. */ +static inline void btr_search_s_unlock_all() +{ + for (ulint i = 0; i < btr_ahi_parts; ++i) { + rw_lock_s_unlock(&btr_search_sys.parts[i].latch); + } +} + +#ifdef UNIV_DEBUG +/** Check if thread owns all the search latches. +@param[in] mode lock mode check +@retval true if owns all of them +@retval false if does not own some of them */ +static inline bool btr_search_own_all(ulint mode) +{ + for (ulint i = 0; i < btr_ahi_parts; ++i) { + if (!rw_lock_own(&btr_search_sys.parts[i].latch, mode)) { + return(false); + } + } + return(true); +} + +/** Check if thread owns any of the search latches. 
+@param[in] mode lock mode check +@retval true if owns any of them +@retval false if owns no search latch */ +static inline bool btr_search_own_any(ulint mode) +{ + for (ulint i = 0; i < btr_ahi_parts; ++i) { + if (rw_lock_own(&btr_search_sys.parts[i].latch, mode)) { + return(true); + } + } + return(false); +} + +/** @return whether this thread holds any of the search latches */ +static inline bool btr_search_own_any() +{ + for (ulint i = btr_ahi_parts; i--; ) { + if (rw_lock_own_flagged(&btr_search_sys.parts[i].latch, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)) { + return true; + } + } + return false; +} +#endif /* UNIV_DEBUG */ +#endif /* BTR_CUR_HASH_ADAPT */ diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h new file mode 100644 index 00000000..83c374e2 --- /dev/null +++ b/storage/innobase/include/btr0types.h @@ -0,0 +1,59 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0types.h +The index tree general types + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0types_h +#define btr0types_h + +#include "page0types.h" +#include "rem0types.h" + +/** Persistent cursor */ +struct btr_pcur_t; +/** B-tree cursor */ +struct btr_cur_t; +/** B-tree search information for the adaptive hash index */ +struct btr_search_t; + +#ifdef BTR_CUR_HASH_ADAPT +/** Whether the search system is enabled. +The search system is protected by an array of latches. */ +extern char btr_search_enabled; + +/** Number of adaptive hash index partitions. */ +extern ulong btr_ahi_parts; +#endif /* BTR_CUR_HASH_ADAPT */ + +/** The size of a reference to data stored on a different page. +The reference is stored at the end of the prefix of the field +in the index record. */ +#define FIELD_REF_SIZE 20U +#define BTR_EXTERN_FIELD_REF_SIZE FIELD_REF_SIZE + +/** If the data don't exceed the size, the data are stored locally. */ +#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE \ + (BTR_EXTERN_FIELD_REF_SIZE * 2) + +#endif diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h new file mode 100644 index 00000000..ee48e7ce --- /dev/null +++ b/storage/innobase/include/buf0block_hint.h @@ -0,0 +1,76 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. 
+This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ +#pragma once +#include "buf0buf.h" + +namespace buf { +class Block_hint { +public: + /** Stores the pointer to the block, which is currently buffer-fixed. + @param block a pointer to a buffer-fixed block to be stored */ + inline void store(buf_block_t *block) + { + ut_ad(block->page.buf_fix_count()); + m_block= block; + m_page_id= block->page.id(); + } + + /** Clears currently stored pointer. */ + inline void clear() { m_block= nullptr; } + + /** Invoke f on m_block(which may be null) + @param f The function to be executed. It will be passed the pointer. + If you wish to use the block pointer subsequently, + you need to ensure you buffer-fix it before returning from f. 
+ @return the return value of f + */ + template <typename F> + bool run_with_hint(const F &f) + { + buffer_fix_block_if_still_valid(); + /* m_block could be changed during f() call, so we use local + variable to remember which block we need to unfix */ + buf_block_t *block= m_block; + bool res= f(block); + if (block) + buf_block_buf_fix_dec(block); + return res; + } + + buf_block_t *block() const { return m_block; } + + private: + /** The block pointer stored by store(). */ + buf_block_t *m_block= nullptr; + /** If m_block is non-null, the m_block->page.id at time it was stored. */ + page_id_t m_page_id{0, 0}; + + /** A helper function which checks if m_block is not a dangling pointer and + still points to block with page with m_page_id and if so, buffer-fixes it, + otherwise clear()s it */ + void buffer_fix_block_if_still_valid(); +}; +} // namespace buf diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h new file mode 100644 index 00000000..cba31074 --- /dev/null +++ b/storage/innobase/include/buf0buddy.h @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buddy.h +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#ifndef buf0buddy_h +#define buf0buddy_h + +#include "buf0types.h" + +/** Map a block size to its slot: the number of times BUF_BUDDY_LOW has to be +doubled until it is at least size. +@param[in] size block size in bytes; asserted with ut_ad() to be a power +of 2 in the range UNIV_ZIP_SIZE_MIN to srv_page_size +@return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ +inline +ulint +buf_buddy_get_slot(ulint size) +{ + ulint i; + ulint s; + + ut_ad(ut_is_2pow(size)); + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + ut_ad(size <= srv_page_size); + + for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) { /* all work is done in the loop header */ + } + ut_ad(i <= BUF_BUDDY_SIZES); + return i; +} + +/** Allocate a ROW_FORMAT=COMPRESSED block. +@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES +@param lru assigned to true if buf_pool.mutex was temporarily released +@return allocated block, never NULL */ +byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc)); + +/** Allocate a ROW_FORMAT=COMPRESSED block. +The size is converted to a slot with buf_buddy_get_slot() and the request +is forwarded to buf_buddy_alloc_low(). +@param size compressed page size in bytes +@param lru assigned to true if buf_pool.mutex was temporarily released +@return allocated block, never NULL */ +inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr) +{ + return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru); +} + +/** Deallocate a block. 
+@param[in] buf block to be freed, must not be pointed to + by the buffer pool +@param[in] size block size in bytes */ +inline void buf_buddy_free(void* buf, ulint size) +{ + buf_buddy_free_low(buf, buf_buddy_get_slot(size)); +} + +/** Try to reallocate a block. +@param[in] buf block to be reallocated, must be pointed +to by the buffer pool +@param[in] size block size, up to srv_page_size +@retval false if failed because of no free blocks. */ +bool buf_buddy_realloc(void* buf, ulint size); + +/** Combine all pairs of free buddies. */ +void buf_buddy_condense_free(); + +#endif /* buf0buddy_h */ diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h new file mode 100644 index 00000000..5a118df4 --- /dev/null +++ b/storage/innobase/include/buf0buf.h @@ -0,0 +1,2456 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buf.h +The database buffer pool high-level routines + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0buf_h +#define buf0buf_h + +/** Magic value to use instead of checksums when they are disabled */ +#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL + +#include "fil0fil.h" +#include "mtr0types.h" +#include "buf0types.h" +#include "span.h" +#include "assume_aligned.h" +#ifndef UNIV_INNOCHECKSUM +#include "hash0hash.h" +#include "ut0byte.h" +#include "page0types.h" +#include "log0log.h" +#include "srv0srv.h" +#include <ostream> + +// Forward declaration +struct fil_addr_t; + +/** @name Modes for buf_page_get_gen */ +/* @{ */ +#define BUF_GET 10 /*!< get always */ +#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ +#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make + the block young in the LRU list */ +#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but + set no latch; we have + separated this case, because + it is error-prone programming + not to set a latch, and it + should be used with care */ +#define BUF_GET_IF_IN_POOL_OR_WATCH 15 + /*!< Get the page only if it's in the + buffer pool, if not then set a watch + on the page. */ +#define BUF_GET_POSSIBLY_FREED 16 + /*!< Like BUF_GET, but do not mind + if the file page has been freed. */ +#define BUF_EVICT_IF_IN_POOL 20 /*!< evict a clean block if found */ +/* @} */ + +# ifdef UNIV_DEBUG +extern my_bool buf_disable_resize_buffer_pool_debug; /*!< if TRUE, resizing + buffer pool is not allowed. 
*/ +# endif /* UNIV_DEBUG */ + +/** buf_page_t::state() values, distinguishing buf_page_t and buf_block_t */ +enum buf_page_state +{ + /** available in buf_pool.free or buf_pool.watch */ + BUF_BLOCK_NOT_USED, + /** allocated for something other than a file page */ + BUF_BLOCK_MEMORY, + /** a previously allocated file page, in transit to NOT_USED */ + BUF_BLOCK_REMOVE_HASH, + /** a buf_block_t that is also in buf_pool.LRU */ + BUF_BLOCK_FILE_PAGE, + /** the buf_page_t of a ROW_FORMAT=COMPRESSED page + whose uncompressed page frame has been evicted */ + BUF_BLOCK_ZIP_PAGE +}; + +/** This structure defines information we will fetch from each buffer pool. It +will be used to print table IO stats */ +struct buf_pool_info_t +{ + /* General buffer pool info */ + ulint pool_size; /*!< Buffer Pool size in pages */ + ulint lru_len; /*!< Length of buf_pool.LRU */ + ulint old_lru_len; /*!< buf_pool.LRU_old_len */ + ulint free_list_len; /*!< Length of buf_pool.free list */ + ulint flush_list_len; /*!< Length of buf_pool.flush_list */ + ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages + pending decompression */ + ulint n_pend_reads; /*!< buf_pool.n_pend_reads, pages + pending read */ + ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */ + ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH + LIST */ + ulint n_pages_made_young; /*!< number of pages made young */ + ulint n_pages_not_made_young; /*!< number of pages not made young */ + ulint n_pages_read; /*!< buf_pool.n_pages_read */ + ulint n_pages_created; /*!< buf_pool.n_pages_created */ + ulint n_pages_written; /*!< buf_pool.n_pages_written */ + ulint n_page_gets; /*!< buf_pool.n_page_gets */ + ulint n_ra_pages_read_rnd; /*!< buf_pool.n_ra_pages_read_rnd, + number of pages readahead */ + ulint n_ra_pages_read; /*!< buf_pool.n_ra_pages_read, number + of pages readahead */ + ulint n_ra_pages_evicted; /*!< buf_pool.n_ra_pages_evicted, + number of readahead pages evicted + without access */ + ulint 
n_page_get_delta; /*!< num of buffer pool page gets since + last printout */ + + /* Buffer pool access stats */ + double page_made_young_rate; /*!< page made young rate in pages + per second */ + double page_not_made_young_rate;/*!< page not made young rate + in pages per second */ + double pages_read_rate; /*!< num of pages read per second */ + double pages_created_rate; /*!< num of pages created per second */ + double pages_written_rate; /*!< num of pages written per second */ + ulint page_read_delta; /*!< num of pages read since last + printout */ + ulint young_making_delta; /*!< num of pages made young since + last printout */ + ulint not_young_making_delta; /*!< num of pages not made young since + last printout */ + + /* Statistics about read ahead algorithm. */ + double pages_readahead_rnd_rate;/*!< random readahead rate in pages per + second */ + double pages_readahead_rate; /*!< readahead rate in pages per + second */ + double pages_evicted_rate; /*!< rate of readahead page evicted + without access, in pages per second */ + + /* Stats about LRU eviction */ + ulint unzip_lru_len; /*!< length of buf_pool.unzip_LRU + list */ + /* Counters for LRU policy */ + ulint io_sum; /*!< buf_LRU_stat_sum.io */ + ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO + for current interval */ + ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */ + ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num + pages decompressed in current + interval */ +}; +#endif /* !UNIV_INNOCHECKSUM */ + +/** Print the given page_id_t object. +@param[in,out] out the output stream +@param[in] page_id the page_id_t object to be printed +@return the output stream */ +std::ostream& +operator<<( + std::ostream& out, + const page_id_t page_id); + +#ifndef UNIV_INNOCHECKSUM +/*********************************************************************//** +Gets the current size of buffer buf_pool in bytes. 
+@return size in bytes */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void); +/*========================*/ + +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ + MY_ATTRIBUTE((malloc)); +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ + MY_ATTRIBUTE((nonnull)); + +/** Allocate a buffer block. +@return own: the allocated block, in state BUF_BLOCK_MEMORY */ +inline buf_block_t *buf_block_alloc(); +/********************************************************************//** +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block); /*!< in, own: block to be freed */ + +/**************************************************************//** +NOTE! The following macros should be used instead of buf_page_get_gen, +to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed +in LA! */ +#define buf_page_get(ID, SIZE, LA, MTR) \ + buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, __FILE__, __LINE__, MTR) + +/**************************************************************//** +Use these macros to bufferfix a page with no latching. Remember not to +read the contents of the page unless you know it is safe. Do not modify +the contents of the page! We have separated this case, because it is +error-prone programming not to set a latch, and it should be used +with care. 
*/ +#define buf_page_get_with_no_latch(ID, SIZE, MTR) \ + buf_page_get_gen(ID, SIZE, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, \ + __FILE__, __LINE__, MTR) +/********************************************************************//** +This is the general function used to get optimistic access to a database +page. +@return TRUE if success */ +ibool +buf_page_optimistic_get( +/*====================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: guessed block */ + ib_uint64_t modify_clock,/*!< in: modify clock value */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/** Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the lock_sys_t::mutex. +@param[in] page_id page id +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@return pointer to a page or NULL */ +buf_block_t* +buf_page_try_get_func( + const page_id_t page_id, + const char* file, + unsigned line, + mtr_t* mtr); + +/** Tries to get a page. +If the page is not in the buffer pool it is not loaded. Suitable for using +when holding the lock_sys_t::mutex. Expands to a single expression. +@param[in] page_id page identifier +@param[in] mtr mini-transaction +@return the page if in buffer pool, NULL if not */ +#define buf_page_try_get(page_id, mtr) \ + buf_page_try_get_func((page_id), __FILE__, __LINE__, (mtr)) + +/** Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with buf_page_release_zip(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. 
+@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size +@return pointer to the block */ +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size); + +/** Get access to a database page. Buffered redo log may be applied. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge while +reading the page from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_gen( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + const char* file, + unsigned line, + mtr_t* mtr, + dberr_t* err = NULL, + bool allow_ibuf_merge = false); + +/** This is the low level function used to get access to a database page. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge whether to allow a change buffer merge +while reading the page from file; if it is allowed, +the function makes sure that change buffer changes are merged while +reading the page from file. 
+@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_low( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + const char* file, + unsigned line, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge); + +/** Initialize a page in the buffer pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_get_gen). +@param[in,out] space space object +@param[in] offset offset of the tablespace +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[in,out] free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* +buf_page_create(fil_space_t *space, uint32_t offset, + ulint zip_size, mtr_t *mtr, buf_block_t *free_block); + +/********************************************************************//** +Releases a compressed-only page acquired with buf_page_get_zip(). */ +UNIV_INLINE +void +buf_page_release_zip( +/*=================*/ + buf_page_t* bpage); /*!< in: buffer block */ +/********************************************************************//** +Releases a latch, if specified. */ +UNIV_INLINE +void +buf_page_release_latch( +/*=====================*/ + buf_block_t* block, /*!< in: buffer block */ + ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH, + RW_NO_LATCH */ +/** Move a block to the start of the LRU list. */ +void buf_page_make_young(buf_page_t *bpage); +/** Mark the page status as FREED for the given tablespace id and +page number. If the page is not in buffer pool then ignore it. 
+@param[in,out] space tablespace +@param[in] page page number +@param[in,out] mtr mini-transaction +@param[in] file file name +@param[in] line line where called */ +void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr, + const char *file, unsigned line); + +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +unsigned +buf_page_get_freed_page_clock( +/*==========================*/ + const buf_page_t* bpage) /*!< in: block */ + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +unsigned +buf_block_get_freed_page_clock( +/*===========================*/ + const buf_block_t* block) /*!< in: block */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Determine if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@param[in] bpage buffer pool page +@return whether bpage is close to MRU end of LRU */ +inline bool buf_page_peek_if_young(const buf_page_t *bpage); + +/** Determine if a block should be moved to the start of the LRU list if +there is danger of dropping from the buffer pool. +@param[in] bpage buffer pool page +@return true if bpage should be made younger */ +inline bool buf_page_peek_if_too_old(const buf_page_t *bpage); + +/** Move a page to the start of the buffer pool LRU list if it is too old. 
+@param[in,out] bpage buffer pool page */ +inline void buf_page_make_young_if_needed(buf_page_t *bpage) +{ + if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) { + buf_page_make_young(bpage); + } +} + +/********************************************************************//** +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +void +buf_block_modify_clock_inc( +/*=======================*/ + buf_block_t* block); /*!< in: block */ +/********************************************************************//** +Returns the value of the modify clock. The caller must have an s-lock +or x-lock on the block. +@return value */ +UNIV_INLINE +ib_uint64_t +buf_block_get_modify_clock( +/*=======================*/ + buf_block_t* block); /*!< in: block */ +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +# ifdef UNIV_DEBUG + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line */ +# endif /* UNIV_DEBUG */ + buf_block_t* block) /*!< in/out: block to bufferfix */ + MY_ATTRIBUTE((nonnull)); + +# ifdef UNIV_DEBUG +/** Increments the bufferfix count. +@param[in,out] b block to bufferfix +@param[in] f file name where requested +@param[in] l line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b) +# else /* UNIV_DEBUG */ +/** Increments the bufferfix count. +@param[in,out] b block to bufferfix +@param[in] f file name where requested +@param[in] l line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b) +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_INNOCHECKSUM */ + +/** Check if a buffer is all zeroes. 
+@param[in] buf data to check +@return whether the buffer is all zeroes */ +bool buf_is_zeroes(st_::span<const byte> buf); + +/** Checks if the page is in crc32 checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in crc32 checksum format. */ +bool +buf_page_is_checksum_valid_crc32( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/** Checks if the page is in innodb checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in innodb checksum format. */ +bool +buf_page_is_checksum_valid_innodb( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/** Checks if the page is in none checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in none checksum format. */ +bool +buf_page_is_checksum_valid_none( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/** Check if a page is corrupt. 
+@param[in] check_lsn whether the LSN should be checked +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return whether the page is corrupted */ +bool +buf_page_is_corrupted( + bool check_lsn, + const byte* read_buf, + ulint fsp_flags) + MY_ATTRIBUTE((warn_unused_result)); + +inline void *aligned_malloc(size_t size, size_t align) +{ +#ifdef _MSC_VER + return _aligned_malloc(size, align); +#else + void *result; + if (posix_memalign(&result, align, size)) + result= NULL; + return result; +#endif +} + +inline void aligned_free(void *ptr) +{ +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +/** Read the key version from the page. In full crc32 format, +key version is stored at {0-3th} bytes. In other format, it is +stored in 26th position. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return key version of the page. */ +inline uint32_t buf_page_get_key_version(const byte* read_buf, ulint fsp_flags) +{ + static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility"); + return fil_space_t::full_crc32(fsp_flags) + ? mach_read_from_4(my_assume_aligned<4>(read_buf)) + : mach_read_from_4(my_assume_aligned<2> + (read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)); +} + +/** Read the compression info from the page. In full crc32 format, +compression info is at MSB of page type. In other format, it is +stored in page type. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return true if page is compressed. */ +inline bool buf_page_is_compressed(const byte* read_buf, ulint fsp_flags) +{ + uint16_t page_type= fil_page_get_type(read_buf); + return fil_space_t::full_crc32(fsp_flags) + ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + : page_type == FIL_PAGE_PAGE_COMPRESSED; +} + +/** Get the compressed or uncompressed size of a full_crc32 page. 
+@param[in] buf page_compressed or uncompressed page +@param[out] comp whether the page could be compressed +@param[out] cr whether the page could be corrupted +@return the payload size in the file page */ +inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr) +{ + uint t = fil_page_get_type(buf); + uint page_size = uint(srv_page_size); + + if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) { + return page_size; + } + + t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER); + t <<= 8; + + if (t < page_size) { + page_size = t; + if (comp) { + *comp = true; + } + } else if (cr) { + *cr = true; + } + + return page_size; +} + +#ifndef UNIV_INNOCHECKSUM +/** Dump a page to stderr. +@param[in] read_buf database page +@param[in] zip_size compressed page size, or 0 */ +void buf_page_print(const byte* read_buf, ulint zip_size = 0) + ATTRIBUTE_COLD __attribute__((nonnull)); +/********************************************************************//** +Decompress a block. +@return TRUE if successful */ +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check); /*!< in: TRUE=verify the page checksum */ + +#ifdef UNIV_DEBUG +/** @return the number of latched pages in the buffer pool */ +ulint buf_get_latched_pages_number(); +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Prints info of the buffer i/o. */ +void +buf_print_io( +/*=========*/ + FILE* file); /*!< in: file where to print */ +/** Collect buffer pool metadata. +@param[out] pool_info buffer pool metadata */ +void buf_stats_get_pool_info(buf_pool_info_t *pool_info); + +/** Refresh the statistics used to print per-second averages. */ +void buf_refresh_io_stats(); + +/** Invalidate all pages in the buffer pool. +All pages must be in a replaceable state (not modified or latched). 
*/ +void buf_pool_invalidate(); + +/*======================================================================== +--------------------------- LOWER LEVEL ROUTINES ------------------------- +=========================================================================*/ + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Adds latch level info for the rw-lock protecting the buffer frame. This +should be called in the debug version after a successful latching of a +page if we know the latching order level of the acquired latch. */ +UNIV_INLINE +void +buf_block_dbg_add_level( +/*====================*/ + buf_block_t* block, /*!< in: buffer page + where we have acquired latch */ + latch_level_t level); /*!< in: latching order level */ +#else /* UNIV_DEBUG */ +# define buf_block_dbg_add_level(block, level) /* nothing */ +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets a pointer to the memory frame of a block. +@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + MY_ATTRIBUTE((warn_unused_result)); +#else /* UNIV_DEBUG */ +# define buf_block_get_frame(block) (block)->frame +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. */ +#define buf_block_get_page_zip(block) \ + (UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL) +#define is_buf_block_get_page_zip(block) \ + UNIV_LIKELY_NULL((block)->page.zip.data) + +/** Monitor the buffer page read/write activity, and increment corresponding +counter value in MONITOR_MODULE_BUF_PAGE. 
+@param bpage buffer page whose read or write was completed +@param io_type BUF_IO_READ or BUF_IO_WRITE */ +ATTRIBUTE_COLD __attribute__((nonnull)) +void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type); + +/** Complete a read request of a file page to buf_pool. +@param bpage recently read page +@param node data file +@return whether the operation succeeded +@retval DB_SUCCESS always when writing, or if a read page was OK +@retval DB_PAGE_CORRUPTED if the checksum fails on a page read +@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */ +dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node); + +/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, +if needed. +@param[in] size size in bytes +@return aligned size */ +UNIV_INLINE +ulint +buf_pool_size_align( + ulint size); + +/** Verify that post encryption checksum match with the calculated checksum. +This function should be called only if tablespace contains crypt data metadata. +@param[in] page page frame +@param[in] fsp_flags tablespace flags +@return true if page is encrypted and OK, false otherwise */ +bool buf_page_verify_crypt_checksum( + const byte* page, + ulint fsp_flags); + +/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. +@param[in,out] page page to update +@param[in] size compressed page size */ +void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size); + +/** @brief The temporary memory structure. + +NOTE! The definition appears here only for other modules of this +directory (buf) to see it. Do not use from outside! */ + +class buf_tmp_buffer_t +{ + /** whether this slot is reserved */ + std::atomic<bool> reserved; +public: + /** For encryption, the data needs to be copied to a separate buffer + before it's encrypted&written. The buffer block itself can be replaced + while a write of crypt_buf to file is in progress. 
*/ + byte *crypt_buf; + /** buffer for fil_page_compress(), for flushing page_compressed pages */ + byte *comp_buf; + /** pointer to resulting buffer after encryption or compression; + not separately allocated memory */ + byte *out_buf; + + /** Release the slot */ + void release() { reserved.store(false, std::memory_order_relaxed); } + + /** Acquire the slot + @return whether the slot was acquired */ + bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed);} + + /** Allocate a buffer for encryption, decryption or decompression. */ + void allocate() + { + if (!crypt_buf) + crypt_buf= static_cast<byte*> + (aligned_malloc(srv_page_size, srv_page_size)); + } +}; + +/** The common buffer control block structure +for compressed and uncompressed frames */ + +class buf_pool_t; + +class buf_page_t +{ + friend buf_pool_t; + friend buf_block_t; + /** @name General fields */ + /* @{ */ + +public: // FIXME: fix fil_iterate() + /** Page id. Protected by buf_pool.hash_lock_get(id) when + the page is in buf_pool.page_hash. */ + page_id_t id_; +private: + /** Count of how manyfold this block is currently bufferfixed. */ + Atomic_counter<uint32_t> buf_fix_count_; + + /** log sequence number of the START of the log entry written of the + oldest modification to this block which has not yet been written + to the data file; + + 0 if no modifications are pending; + 1 if no modifications are pending, but the block is in buf_pool.flush_list; + 2 if modifications are pending, but the block is not in buf_pool.flush_list + (because id().space() is the temporary tablespace). */ + Atomic_relaxed<lsn_t> oldest_modification_; + + /** type of pending I/O operation; protected by buf_pool.mutex + if in_LRU_list */ + Atomic_relaxed<buf_io_fix> io_fix_; + /** Block state. @see in_file(). + State transitions between in_file() states and to + BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id) + when the block is in buf_pool.page_hash. 
+ Other transitions when in_LRU_list are protected by buf_pool.mutex. */ + buf_page_state state_; + +public: + /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */ + buf_page_t *hash; + /* @} */ + page_zip_des_t zip; /*!< compressed page; zip.data + (but not the data it points to) is + also protected by buf_pool.mutex; + state == BUF_BLOCK_ZIP_PAGE and + zip.data == NULL means an active + buf_pool.watch */ + + buf_tmp_buffer_t* slot; /*!< Slot for temporary memory + used for encryption/compression + or NULL */ +#ifdef UNIV_DEBUG + /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */ + bool in_zip_hash; + /** whether this->LRU is in buf_pool.LRU (in_file() holds); + protected by buf_pool.mutex */ + bool in_LRU_list; + /** whether this is in buf_pool.page_hash (in_file() holds); + protected by buf_pool.mutex */ + bool in_page_hash; + /** whether this->list is in buf_pool.free (state() == BUF_BLOCK_NOT_USED); + protected by buf_pool.flush_list_mutex */ + bool in_free_list; +#endif /* UNIV_DEBUG */ + /** list member in one of the lists of buf_pool; protected by + buf_pool.mutex or buf_pool.flush_list_mutex + + state() == BUF_BLOCK_NOT_USED: buf_pool.free or buf_pool.withdraw + + in_file() && oldest_modification(): + buf_pool.flush_list (protected by buf_pool.flush_list_mutex) + + The contents is undefined if in_file() && !oldest_modification(), + or if state() is BUF_BLOCK_MEMORY or BUF_BLOCK_REMOVE_HASH. */ + UT_LIST_NODE_T(buf_page_t) list; + + /** @name LRU replacement algorithm fields. + Protected by buf_pool.mutex. 
*/ + /* @{ */ + + UT_LIST_NODE_T(buf_page_t) LRU; + /*!< node of the LRU list */ + unsigned old:1; /*!< TRUE if the block is in the old + blocks in buf_pool.LRU_old */ + unsigned freed_page_clock:31;/*!< the value of + buf_pool.freed_page_clock + when this block was the last + time put to the head of the + LRU list; a thread is allowed + to read this for heuristic + purposes without holding any + mutex or latch */ + /* @} */ + Atomic_counter<unsigned> access_time; /*!< time of first access, or + 0 if the block was never accessed + in the buffer pool. + + For state==BUF_BLOCK_MEMORY + blocks, this field can be repurposed + for something else. + + When this field counts log records + and bytes allocated for recv_sys.pages, + the field is protected by + recv_sys_t::mutex. */ + /** Change buffer entries for the page exist. + Protected by io_fix()==BUF_IO_READ or by buf_block_t::lock. */ + bool ibuf_exist; + + /** Block initialization status. Can be modified while holding io_fix() + or buf_block_t::lock X-latch */ + enum { + /** the page was read normally and should be flushed normally */ + NORMAL = 0, + /** the page was (re)initialized, and the doublewrite buffer can be + skipped on the next flush */ + INIT_ON_FLUSH, + /** the page was freed and need to be flushed. + For page_compressed, page flush will punch a hole to free space. + Else if innodb_immediate_scrub_data_uncompressed, the page will + be overwritten with zeroes. 
*/ + FREED + } status; + + buf_page_t() : id_(0) + { + static_assert(BUF_BLOCK_NOT_USED == 0, "compatibility"); + memset((void*) this, 0, sizeof *this); + } + + /** Initialize some fields */ + void init() + { + io_fix_= BUF_IO_NONE; + buf_fix_count_= 0; + old= 0; + freed_page_clock= 0; + access_time= 0; + oldest_modification_= 0; + slot= nullptr; + ibuf_exist= false; + status= NORMAL; + ut_d(in_zip_hash= false); + ut_d(in_free_list= false); + ut_d(in_LRU_list= false); + ut_d(in_page_hash= false); + HASH_INVALIDATE(this, hash); + } + + /** Initialize some more fields */ + void init(buf_page_state state, page_id_t id, uint32_t buf_fix_count= 0) + { + init(); + state_= state; + id_= id; + buf_fix_count_= buf_fix_count; + } + + /** Initialize some more fields */ + void init(page_id_t id, uint32_t buf_fix_count= 0) + { + init(); + id_= id; + buf_fix_count_= buf_fix_count; + } + +public: + const page_id_t &id() const { return id_; } + buf_page_state state() const { return state_; } + uint32_t buf_fix_count() const { return buf_fix_count_; } + buf_io_fix io_fix() const { return io_fix_; } + void io_unfix() + { + ut_d(const auto old_io_fix= io_fix()); + ut_ad(old_io_fix == BUF_IO_READ || old_io_fix == BUF_IO_PIN); + io_fix_= BUF_IO_NONE; + } + + /** @return if this belongs to buf_pool.unzip_LRU */ + bool belongs_to_unzip_LRU() const + { + return zip.data && state() != BUF_BLOCK_ZIP_PAGE; + } + + inline void add_buf_fix_count(uint32_t count); + inline void set_buf_fix_count(uint32_t count); + inline void set_state(buf_page_state state); + inline void set_io_fix(buf_io_fix io_fix); + inline void set_corrupt_id(); + + /** @return the log sequence number of the oldest pending modification + @retval 0 if the block is being removed from (or not in) buf_pool.flush_list + @retval 1 if the block is in buf_pool.flush_list but not modified + @retval 2 if the block belongs to the temporary tablespace and + has unwritten changes */ + lsn_t oldest_modification() const { return 
oldest_modification_; } + /** @return the log sequence number of the oldest pending modification, + @retval 0 if the block is definitely not in buf_pool.flush_list + @retval 1 if the block is in buf_pool.flush_list but not modified + @retval 2 if the block belongs to the temporary tablespace and + has unwritten changes */ + lsn_t oldest_modification_acquire() const + { return oldest_modification_.load(std::memory_order_acquire); } + /** Set oldest_modification when adding to buf_pool.flush_list */ + inline void set_oldest_modification(lsn_t lsn); + /** Clear oldest_modification after removing from buf_pool.flush_list */ + inline void clear_oldest_modification(); + /** Note that a block is no longer dirty, while not removing + it from buf_pool.flush_list */ + inline void clear_oldest_modification(bool temporary); + + /** Notify that a page in a temporary tablespace has been modified. */ + void set_temp_modified() + { + ut_ad(fsp_is_system_temporary(id().space())); + ut_ad(state() == BUF_BLOCK_FILE_PAGE); + ut_ad(!oldest_modification()); + oldest_modification_= 2; + } + + /** Prepare to release a file page to buf_pool.free. */ + void free_file_page() + { + ut_ad(state() == BUF_BLOCK_REMOVE_HASH); + /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */ + ut_d(oldest_modification_= 0;) + set_corrupt_id(); + ut_d(set_state(BUF_BLOCK_MEMORY)); + } + + void fix() { buf_fix_count_++; } + uint32_t unfix() + { + uint32_t count= buf_fix_count_--; + ut_ad(count != 0); + return count - 1; + } + + /** @return the physical size, in bytes */ + ulint physical_size() const + { + return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size; + } + + /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes + @retval 0 if not compressed */ + ulint zip_size() const + { + return zip.ssize ? 
(UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0; + } + + /** @return the byte offset of the page within a file */ + os_offset_t physical_offset() const + { + os_offset_t o= id().page_no(); + return zip.ssize + ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1)) + : o << srv_page_size_shift; + } + + /** @return whether the block is mapped to a data file */ + bool in_file() const + { + switch (state_) { + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_FILE_PAGE: + return true; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + return false; + } + + ut_error; + return false; + } + + /** @return whether the block is modified and ready for flushing */ + inline bool ready_for_flush() const; + /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */ + bool ready_for_replace() const + { return !oldest_modification() && can_relocate(); } + /** @return whether the block can be relocated in memory. + The block can be dirty, but it must not be I/O-fixed or bufferfixed. 
*/ + inline bool can_relocate() const; + /** @return whether the block has been flagged old in buf_pool.LRU */ + inline bool is_old() const; + /** Set whether a block is old in buf_pool.LRU */ + inline void set_old(bool old); + /** Flag a page accessed in buf_pool + @return whether this is not the first access */ + bool set_accessed() + { + if (is_accessed()) return true; + access_time= static_cast<uint32_t>(ut_time_ms()); + return false; + } + /** @return ut_time_ms() at the time of first access of a block in buf_pool + @retval 0 if not accessed */ + unsigned is_accessed() const { ut_ad(in_file()); return access_time; } +}; + +/** The buffer control block structure: a buf_page_t descriptor together with its page frame pointer and read-write latch */ + +struct buf_block_t{ + + /** @name General fields */ + /* @{ */ + + buf_page_t page; /*!< page information; this must + be the first field, so that + buf_pool.page_hash can point + to buf_page_t or buf_block_t */ + byte* frame; /*!< pointer to buffer frame which + is of size srv_page_size, and + aligned to an address divisible by + srv_page_size */ + rw_lock_t lock; /*!< read-write lock of the buffer + frame */ +#ifdef UNIV_DEBUG + /** whether page.list is in buf_pool.withdraw + ((state() == BUF_BLOCK_NOT_USED)) and the buffer pool is being shrunk; + protected by buf_pool.mutex */ + bool in_withdraw_list; + /** whether unzip_LRU is in buf_pool.unzip_LRU + (state() == BUF_BLOCK_FILE_PAGE and zip.data != nullptr); + protected by buf_pool.mutex */ + bool in_unzip_LRU_list; +#endif + UT_LIST_NODE_T(buf_block_t) unzip_LRU; + /*!< node of the decompressed LRU list; + a block is in the unzip_LRU list + if page.state() == BUF_BLOCK_FILE_PAGE + and page.zip.data != NULL */ + /* @} */ + /** @name Optimistic search field */ + /* @{ */ + + ib_uint64_t modify_clock; /*!< this clock is incremented every + time a pointer to a record on the + page may become obsolete; this is + used in the optimistic cursor + positioning: if the modify clock has + not changed, we know that the pointer + is still valid; this field 
may be + changed if the thread (1) owns the + pool mutex and the page is not + bufferfixed, or (2) the thread has an + x-latch on the block */ + /* @} */ +#ifdef BTR_CUR_HASH_ADAPT + /** @name Hash search fields (unprotected) + NOTE that these fields are NOT protected by any semaphore! */ + /* @{ */ + + volatile uint16_t n_bytes; /*!< recommended prefix length for hash + search: number of bytes in + an incomplete last field */ + volatile uint16_t n_fields; /*!< recommended prefix length for hash + search: number of full fields */ + uint16_t n_hash_helps; /*!< counter which controls building + of a new hash index for the page */ + volatile bool left_side; /*!< true or false, depending on + whether the leftmost record of several + records with the same prefix should be + indexed in the hash index */ + /* @} */ + + /** @name Hash search fields + These 5 fields may only be modified when: + we are holding the appropriate x-latch in btr_search_latches[], and + one of the following holds: + (1) the block state is BUF_BLOCK_FILE_PAGE, and + we are holding an s-latch or x-latch on buf_block_t::lock, or + (2) page.buf_fix_count() == 0, or + (3) the block state is BUF_BLOCK_REMOVE_HASH. + + An exception to this is when we init or create a page + in the buffer pool in buf0buf.cc. + + Another exception for buf_pool_t::clear_hash_index() is that + assigning block->index = NULL (and block->n_pointers = 0) + is allowed whenever btr_search_own_all(RW_LOCK_X). + + Another exception is that ha_insert_for_fold() may + decrement n_pointers without holding the appropriate latch + in btr_search_latches[]. Thus, n_pointers must be + protected by atomic memory access. + + This implies that the fields may be read without race + condition whenever any of the following hold: + - the btr_search_latches[] s-latch or x-latch is being held, or + - the block state is not BUF_BLOCK_FILE_PAGE or BUF_BLOCK_REMOVE_HASH, + and holding some latch prevents the state from changing to that. 
+ + Some use of assert_block_ahi_empty() or assert_block_ahi_valid() + is prone to race conditions while buf_pool_t::clear_hash_index() is + executing (the adaptive hash index is being disabled). Such use + is explicitly commented. */ + + /* @{ */ + +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + Atomic_counter<ulint> + n_pointers; /*!< used in debugging: the number of + pointers in the adaptive hash index + pointing to this frame; + protected by atomic memory access + or btr_search_own_all(). */ +# define assert_block_ahi_empty(block) \ + ut_a((block)->n_pointers == 0) +# define assert_block_ahi_empty_on_init(block) do { \ + MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \ + assert_block_ahi_empty(block); \ +} while (0) +# define assert_block_ahi_valid(block) \ + ut_a((block)->index || (block)->n_pointers == 0) +# else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +# define assert_block_ahi_empty(block) /* nothing */ +# define assert_block_ahi_empty_on_init(block) /* nothing */ +# define assert_block_ahi_valid(block) /* nothing */ +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + unsigned curr_n_fields:10;/*!< prefix length for hash indexing: + number of full fields */ + unsigned curr_n_bytes:15;/*!< number of bytes in hash + indexing */ + unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */ + dict_index_t* index; /*!< Index for which the + adaptive hash index has been + created, or NULL if the page + does not exist in the + index. Note that it does not + guarantee that the index is + complete, though: there may + have been hash collisions, + record deletions, etc. 
*/ + /* @} */ +#else /* BTR_CUR_HASH_ADAPT */ +# define assert_block_ahi_empty(block) /* nothing */ +# define assert_block_ahi_empty_on_init(block) /* nothing */ +# define assert_block_ahi_valid(block) /* nothing */ +#endif /* BTR_CUR_HASH_ADAPT */ +# ifdef UNIV_DEBUG + /** @name Debug fields */ + /* @{ */ + rw_lock_t* debug_latch; /*!< in the debug version, each thread + which bufferfixes the block acquires + an s-latch here; so we can use the + debug utilities in sync0rw */ + /* @} */ +# endif + void fix() { page.fix(); } + uint32_t unfix() + { + ut_ad(page.buf_fix_count() || page.io_fix() != BUF_IO_NONE || + page.state() == BUF_BLOCK_ZIP_PAGE || + !rw_lock_own_flagged(&lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S | + RW_LOCK_FLAG_SX)); + return page.unfix(); + } + + /** @return the physical size, in bytes */ + ulint physical_size() const { return page.physical_size(); } + + /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes + @retval 0 if not compressed */ + ulint zip_size() const { return page.zip_size(); } + + /** Initialize the block. + @param page_id page identifier + @param zip_size ROW_FORMAT=COMPRESSED page size, or 0 + @param fix initial buf_fix_count() */ + void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0); +}; + +/**********************************************************************//** +Compute the hash fold value for blocks in buf_pool.zip_hash. */ +/* @{ */ +#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift) +#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame) +#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) +/* @} */ + +/** A "Hazard Pointer" class used to iterate over page lists +inside the buffer pool. A hazard pointer is a buf_page_t pointer +which we intend to iterate over next and we want it remain valid +even after we release the buffer pool mutex. 
*/ +class HazardPointer +{ +public: + virtual ~HazardPointer() {} + + /** @return the current hazard pointer, or nullptr if none is set */ + buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; } + + /** Set the hazard pointer + @param bpage buffer block to be set as hp (must be in_file()), or nullptr to clear it */ + void set(buf_page_t *bpage) + { + mysql_mutex_assert_owner(m_mutex); + ut_ad(!bpage || bpage->in_file()); + m_hp= bpage; + } + + /** Check whether a block is the current hazard pointer + @param bpage buffer block to be compared + @return true if it is hp */ + bool is_hp(const buf_page_t *bpage) const + { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; } + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. */ + virtual void adjust(const buf_page_t*) = 0; + +#ifdef UNIV_DEBUG + /** mutex that protects access to m_hp (present in debug builds only) */ + const mysql_mutex_t *m_mutex= nullptr; +#endif /* UNIV_DEBUG */ + +protected: + /** hazard pointer */ + buf_page_t *m_hp= nullptr; +}; + +/** Class implementing buf_pool.flush_list hazard pointer */ +class FlushHp : public HazardPointer +{ +public: + ~FlushHp() override {} + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. + @param bpage buffer block to be compared; if it is the hp, the hp moves to its predecessor in the list */ + void adjust(const buf_page_t *bpage) override + { + ut_ad(bpage != NULL); + + /* We only support reverse traversal for now. */ + if (is_hp(bpage)) + m_hp= UT_LIST_GET_PREV(list, m_hp); + + ut_ad(!m_hp || m_hp->oldest_modification()); + } +}; + +/** Class implementing buf_pool.LRU hazard pointer */ +class LRUHp : public HazardPointer { +public: + ~LRUHp() override {} + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. + @param bpage buffer block to be compared; if it is the hp, the hp moves to its predecessor in the LRU list */ + void adjust(const buf_page_t *bpage) override + { + ut_ad(bpage); + /** We only support reverse traversal for now. 
*/ + if (is_hp(bpage)) + m_hp= UT_LIST_GET_PREV(LRU, m_hp); + + ut_ad(!m_hp || m_hp->in_LRU_list); + } +}; + +/** Special-purpose iterator to be used when scanning the LRU list. +The idea is that when one thread finishes its scan it leaves the +itr in that position and another thread can start its scan from +there */ +class LRUItr : public LRUHp { +public: + LRUItr() : LRUHp() {} + ~LRUItr() override {} + + /** Select from where to start a scan. If we have scanned + too deep into the LRU list it resets the value to the tail + of the LRU list. + @return buf_page_t from where to start scan. */ + inline buf_page_t *start(); +}; + +/** Struct that is embedded in the free zip blocks */ +struct buf_buddy_free_t { + union { + ulint size; /*!< size of the block */ + byte bytes[FIL_PAGE_DATA]; + /*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID] + == BUF_BUDDY_FREE_STAMP denotes a free + block. If the space_id field of buddy + block != BUF_BUDDY_FREE_STAMP, the block + is not in any zip_free list. If the + space_id is BUF_BUDDY_FREE_STAMP then + stamp[0] will contain the + buddy block size. */ + } stamp; + + buf_page_t bpage; /*!< Embedded bpage descriptor */ + UT_LIST_NODE_T(buf_buddy_free_t) list; + /*!< Node of zip_free list */ +}; + +/** @brief The buffer pool statistics structure. 
*/ +struct buf_pool_stat_t{ + ulint n_page_gets; /*!< number of page gets performed; + also successful searches through + the adaptive hash index are + counted as page gets; this field + is NOT protected by the buffer + pool mutex */ + ulint n_pages_read; /*!< number of read operations */ + ulint n_pages_written;/*!< number of write operations */ + ulint n_pages_created;/*!< number of pages created + in the pool with no read */ + ulint n_ra_pages_read_rnd;/*!< number of pages read in + as part of random read ahead */ + ulint n_ra_pages_read;/*!< number of pages read in + as part of read ahead */ + ulint n_ra_pages_evicted;/*!< number of read ahead + pages that are evicted without + being accessed */ + ulint n_pages_made_young; /*!< number of pages made young, in + buf_page_make_young() */ + ulint n_pages_not_made_young; /*!< number of pages not made + young because the first access + was not long enough ago, in + buf_page_peek_if_too_old() */ + /** number of waits for eviction; writes protected by buf_pool.mutex */ + ulint LRU_waits; + ulint LRU_bytes; /*!< LRU size in bytes */ + ulint flush_list_bytes;/*!< flush_list size in bytes */ +}; + +/** Statistics of buddy blocks of a given size. */ +struct buf_buddy_stat_t { + /** Number of blocks allocated from the buddy system. */ + ulint used; + /** Number of blocks relocated by the buddy system. */ + ib_uint64_t relocated; + /** Total duration of block relocations, in microseconds. 
*/ + ib_uint64_t relocated_usec; +}; + +/** The buffer pool */ +class buf_pool_t +{ + /** A chunk of buffers */ + struct chunk_t + { + /** number of elements in blocks[] */ + size_t size; + /** memory allocated for the page frames */ + unsigned char *mem; + /** descriptor of mem */ + ut_new_pfx_t mem_pfx; + /** array of buffer control blocks */ + buf_block_t *blocks; + + /** Map of first page frame address to chunks[] */ + using map= std::map<const void*, chunk_t*, std::less<const void*>, + ut_allocator<std::pair<const void* const,chunk_t*>>>; + /** Chunk map that may be under construction by buf_resize_thread() */ + static map *map_reg; + /** Current chunk map for lookup only */ + static map *map_ref; + + /** @return the memory size recorded in mem_pfx, in bytes */ + size_t mem_size() const { return mem_pfx.m_size; } + + /** Register the chunk in map_reg, keyed by the frame of its first block */ + void reg() { map_reg->emplace(map::value_type(blocks->frame, this)); } + + /** Allocate a chunk of buffer frames. + @param bytes requested size + @return whether the allocation succeeded */ + inline bool create(size_t bytes); + +#ifdef UNIV_DEBUG + /** Find a block that points to a ROW_FORMAT=COMPRESSED page + @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame + @return the block + @retval nullptr if not found */ + const buf_block_t *contains_zip(const void *data) const + { + const buf_block_t *block= blocks; + for (auto i= size; i--; block++) + if (block->page.zip.data == data) + return block; + return nullptr; + } + + /** Check that all blocks are in a replaceable state. + @return address of a non-free block + @retval nullptr if all freed */ + inline const buf_block_t *not_freed() const; +#endif /* UNIV_DEBUG */ + }; + + /** Withdraw blocks from the buffer pool until meeting withdraw_target. + @return whether retry is needed */ + inline bool withdraw_blocks(); + + /** Determine if a pointer belongs to a buf_block_t. It can be a pointer to + the buf_block_t itself or a member of it. 
+ @param ptr a pointer that will not be dereferenced + @return whether the ptr belongs to a buf_block_t struct */ + bool is_block_field(const void *ptr) const + { + const chunk_t *chunk= chunks; + const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new); + + /* TODO: protect chunks with a mutex (the older pointer will + currently remain during resize()) */ + for (; chunk < echunk; chunk++) + if (ptr >= reinterpret_cast<const void*>(chunk->blocks) && + ptr < reinterpret_cast<const void*>(chunk->blocks + chunk->size)) + return true; + return false; + } + + /** Try to reallocate a control block. + @param block control block to reallocate + @return whether the reallocation succeeded */ + inline bool realloc(buf_block_t *block); + +public: + bool is_initialised() const { return chunks != nullptr; } + + /** Create the buffer pool. + @return whether the creation failed */ + bool create(); + + /** Clean up after successful create() */ + void close(); + + /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */ + inline void resize(); + + /** @return whether resize() is in progress */ + bool resize_in_progress() const + { + return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed)); + } + + /** @return the current size in blocks */ + size_t get_n_pages() const + { + ut_ad(is_initialised()); + size_t size= 0; + for (auto j= n_chunks; j--; ) + size+= chunks[j].size; + return size; + } + + /** Determine whether a frame is intended to be withdrawn during resize(). 
+ @param ptr pointer within a buf_block_t::frame + @return whether the frame will be withdrawn */ + bool will_be_withdrawn(const byte *ptr) const + { + ut_ad(curr_size < old_size); +#ifdef SAFE_MUTEX + if (resizing.load(std::memory_order_relaxed)) + mysql_mutex_assert_owner(&mutex); +#endif /* SAFE_MUTEX */ + + for (const chunk_t *chunk= chunks + n_chunks_new, + * const echunk= chunks + n_chunks; + chunk != echunk; chunk++) + if (ptr >= chunk->blocks->frame && + ptr < (chunk->blocks + chunk->size - 1)->frame + srv_page_size) + return true; + return false; + } + + /** Determine whether a block is intended to be withdrawn during resize(). + @param bpage buffer pool block + @return whether the frame will be withdrawn */ + bool will_be_withdrawn(const buf_page_t &bpage) const + { + ut_ad(curr_size < old_size); +#ifdef SAFE_MUTEX + if (resizing.load(std::memory_order_relaxed)) + mysql_mutex_assert_owner(&mutex); +#endif /* SAFE_MUTEX */ + + for (const chunk_t *chunk= chunks + n_chunks_new, + * const echunk= chunks + n_chunks; + chunk != echunk; chunk++) + if (&bpage >= &chunk->blocks->page && + &bpage < &chunk->blocks[chunk->size].page) + return true; + return false; + } + + /** Release and evict a corrupted page. + @param bpage page that was being read */ + ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage); + + /** Release a memory block to the buffer pool. 
*/ + ATTRIBUTE_COLD void free_block(buf_block_t *block); + +#ifdef UNIV_DEBUG + /** Find a block that points to a ROW_FORMAT=COMPRESSED page + @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame + @return the block + @retval nullptr if not found */ + const buf_block_t *contains_zip(const void *data) const + { + mysql_mutex_assert_owner(&mutex); + for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks; + chunk != end; chunk++) + if (const buf_block_t *block= chunk->contains_zip(data)) + return block; + return nullptr; + } + + /** Assert that all buffer pool pages are in a replaceable state */ + void assert_all_freed(); +#endif /* UNIV_DEBUG */ + +#ifdef BTR_CUR_HASH_ADAPT + /** Clear the adaptive hash index on all pages in the buffer pool. */ + inline void clear_hash_index(); + + /** Get a buffer block from an adaptive hash index pointer. + This function does not return if the block is not identified. + @param ptr pointer to within a page frame + @return pointer to block, never NULL */ + inline buf_block_t *block_from_ahi(const byte *ptr) const; +#endif /* BTR_CUR_HASH_ADAPT */ + + bool is_block_lock(const rw_lock_t *l) const + { return is_block_field(static_cast<const void*>(l)); } + + /** + @return the smallest oldest_modification lsn for any page + @retval empty_lsn if all modified persistent pages have been flushed */ + lsn_t get_oldest_modification(lsn_t empty_lsn) + { + mysql_mutex_assert_owner(&flush_list_mutex); + while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list)) + { + ut_ad(!fsp_is_system_temporary(bpage->id().space())); + lsn_t lsn= bpage->oldest_modification(); + if (lsn != 1) + { + ut_ad(lsn > 2); + return lsn; + } + delete_from_flush_list(bpage); + } + return empty_lsn; + } + + /** Determine if a buffer block was created by chunk_t::create(). 
+ @param block block descriptor (not dereferenced) + @return whether block has been created by chunk_t::create() */ + bool is_uncompressed(const buf_block_t *block) const + { + return is_block_field(reinterpret_cast<const void*>(block)); + } + + /** Get the page_hash latch for a page */ + page_hash_latch *hash_lock_get(const page_id_t id) const + { + return page_hash.lock_get(id.fold()); + } + + /** Look up a block descriptor. + @param id page identifier + @param fold id.fold() + @return block descriptor, possibly in watch[] + @retval nullptr if not found */ + buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold) + { + ut_ad(id.fold() == fold); +#ifdef SAFE_MUTEX + DBUG_ASSERT(mysql_mutex_is_owner(&mutex) || + page_hash.lock_get(fold)->is_locked()); +#endif /* SAFE_MUTEX */ + buf_page_t *bpage; + /* Look for the page in the hash table */ + HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage, + ut_ad(bpage->in_page_hash), id == bpage->id()); + return bpage; + } +private: + /** Look up a block descriptor. + @tparam exclusive whether the latch is to be acquired exclusively + @tparam watch whether to allow watch_is_sentinel() + @param page_id page identifier + @param fold page_id.fold() + @param hash_lock pointer to the acquired latch (to be released by caller) + @return pointer to the block; a watch sentinel is returned only if watch is set, and then the latch has already been released + @retval nullptr if no block was found; !hash_lock || !*hash_lock will also hold */ + template<bool exclusive,bool watch> + buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold, + page_hash_latch **hash_lock) + { + ut_ad(hash_lock || !exclusive); + page_hash_latch *latch= page_hash.lock<exclusive>(fold); + buf_page_t *bpage= page_hash_get_low(page_id, fold); + if (!bpage || watch_is_sentinel(*bpage)) + { + latch->release<exclusive>(); + if (hash_lock) + *hash_lock= nullptr; + return watch ? 
bpage : nullptr; + } + + ut_ad(bpage->in_file()); + ut_ad(page_id == bpage->id()); + + if (hash_lock) + *hash_lock= latch; /* to be released by the caller */ + else + latch->release<exclusive>(); + return bpage; + } +public: + /** Look up a block descriptor. + @tparam exclusive whether the latch is to be acquired exclusively + @param page_id page identifier + @param fold page_id.fold() + @param hash_lock pointer to the acquired latch (to be released by caller) + @return pointer to the block + @retval nullptr if no block was found; !hash_lock || !*hash_lock will also hold */ + template<bool exclusive> + buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold, + page_hash_latch **hash_lock) + { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); } + + /** @return whether the buffer pool contains a page + @tparam watch whether to allow watch_is_sentinel() + @param page_id page identifier */ + template<bool watch= false> + bool page_hash_contains(const page_id_t page_id) + { + return page_hash_get_locked<false,watch>(page_id, page_id.fold(), nullptr); + } + + /** Determine if a block is a sentinel for a buffer pool watch, + that is, whether the descriptor lies within watch[]. + @param bpage page descriptor + @return whether bpage is a sentinel for a buffer pool watch */ + bool watch_is_sentinel(const buf_page_t &bpage) + { +#ifdef SAFE_MUTEX + DBUG_ASSERT(mysql_mutex_is_owner(&mutex) || + hash_lock_get(bpage.id())->is_locked()); +#endif /* SAFE_MUTEX */ + ut_ad(bpage.in_file()); + + if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)]) + { + ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data); + return false; + } + + ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE); + ut_ad(!bpage.in_zip_hash); + ut_ad(!bpage.zip.data); + return true; + } + + /** Check if a watched page has been read. + This may only be called after !watch_set() and before invoking watch_unset(). 
+ @param id page identifier + @return whether the page was read to the buffer pool */ + bool watch_occurred(const page_id_t id) + { + const ulint fold= id.fold(); + page_hash_latch *hash_lock= page_hash.lock<false>(fold); + /* The page must exist because watch_set() increments buf_fix_count. */ + buf_page_t *bpage= page_hash_get_low(id, fold); + const bool is_sentinel= watch_is_sentinel(*bpage); + hash_lock->read_unlock(); + return !is_sentinel; + } + + /** Register a watch for a page identifier. The caller must hold an + exclusive page hash latch. The *hash_lock may be released, + relocated, and reacquired. + @param id page identifier + @param hash_lock exclusively held page_hash latch + @return a buffer pool block corresponding to id + @retval nullptr if the block was not present, and a watch was installed */ + inline buf_page_t *watch_set(const page_id_t id, + page_hash_latch **hash_lock); + + /** Stop watching whether a page has been read in. + watch_set(id) must have returned nullptr before. + @param id page identifier */ + void watch_unset(const page_id_t id) + { + const ulint fold= id.fold(); + page_hash_latch *hash_lock= page_hash.lock<true>(fold); + /* The page must exist because watch_set() increments buf_fix_count. */ + buf_page_t *watch= page_hash_get_low(id, fold); + if (watch->unfix() == 0 && watch_is_sentinel(*watch)) + { + /* The following is based on watch_remove(). */ + ut_ad(watch->in_page_hash); + ut_d(watch->in_page_hash= false); + HASH_DELETE(buf_page_t, hash, &page_hash, fold, watch); + hash_lock->write_unlock(); + // Now that the watch is detached from page_hash, release it to watch[]. + mysql_mutex_lock(&mutex); + /* It is possible that watch_remove() already removed the watch. 
*/ + if (watch->id_ == id) + { + ut_ad(!watch->buf_fix_count()); + ut_ad(watch->state() == BUF_BLOCK_ZIP_PAGE); + watch->set_state(BUF_BLOCK_NOT_USED); + } + mysql_mutex_unlock(&mutex); + } + else + hash_lock->write_unlock(); + } + + /** Remove the sentinel block for the watch before replacing it with a + real block. watch_unset() or watch_occurred() will notice + that the block has been replaced with the real block. + @param watch sentinel */ + inline void watch_remove(buf_page_t *watch); + + /** @return whether less than 1/4 of the buffer pool is available */ + bool running_out() const + { + return !recv_recovery_is_on() && + UNIV_UNLIKELY(UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < + std::min(curr_size, old_size) / 4); + } + +#ifdef UNIV_DEBUG + /** Validate the buffer pool. */ + void validate(); +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG + /** Write information of the buf_pool to the error log. */ + void print(); +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + + /** Remove a block from the LRU list. + @return the predecessor in the LRU list */ + buf_page_t *LRU_remove(buf_page_t *bpage) + { + mysql_mutex_assert_owner(&mutex); + ut_ad(bpage->in_LRU_list); + ut_ad(bpage->in_page_hash); + ut_ad(!bpage->in_zip_hash); + ut_ad(bpage->in_file()); + lru_hp.adjust(bpage); + lru_scan_itr.adjust(bpage); + ut_d(bpage->in_LRU_list= false); + buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); + UT_LIST_REMOVE(LRU, bpage); + return prev; + } + + /** Number of pages to read ahead */ + static constexpr uint32_t READ_AHEAD_PAGES= 64; + + /** Buffer pool mutex */ + MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; + /** Number of pending LRU flush; protected by mutex. 
*/ + ulint n_flush_LRU_; + /** broadcast when n_flush_LRU reaches 0; protected by mutex */ + pthread_cond_t done_flush_LRU; + /** Number of pending flush_list flush; protected by mutex */ + ulint n_flush_list_; + /** broadcast when n_flush_list reaches 0; protected by mutex */ + pthread_cond_t done_flush_list; + + TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; } + TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; } + + /** @name General fields */ + /* @{ */ + ulint curr_pool_size; /*!< Current pool size in bytes */ + ulint LRU_old_ratio; /*!< Reserve this much of the buffer + pool for "old" blocks */ +#ifdef UNIV_DEBUG + ulint buddy_n_frames; /*!< Number of frames allocated from + the buffer pool to the buddy system */ + ulint mutex_exit_forbidden; /*!< Forbid release mutex */ +#endif + ut_allocator<unsigned char> allocator; /*!< Allocator used for + allocating memory for the "chunks" + member. */ + volatile ulint n_chunks; /*!< number of buffer pool chunks */ + volatile ulint n_chunks_new; /*!< new number of buffer pool chunks */ + chunk_t* chunks; /*!< buffer pool chunks */ + chunk_t* chunks_old; /*!< old buffer pool chunks to be freed + after resizing buffer pool */ + /** current pool size in pages */ + Atomic_counter<ulint> curr_size; + /** previous pool size in pages */ + Atomic_counter<ulint> old_size; + /** read-ahead request size in pages */ + Atomic_counter<uint32_t> read_ahead_area; + + /** Hash table with singly-linked overflow lists. @see hash_table_t */ + struct page_hash_table + { + /** Number of array[] elements per page_hash_latch. + Must be one less than a power of 2. */ + static constexpr size_t ELEMENTS_PER_LATCH= CPU_LEVEL1_DCACHE_LINESIZE / + sizeof(void*) - 1; + + /** number of payload elements in array[] */ + Atomic_relaxed<ulint> n_cells; + /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */ + hash_cell_t *array; + + /** Create the hash table. 
+ @param n the lower bound of n_cells */ + void create(ulint n); + + /** Free the hash table. */ + void free() { aligned_free(array); array= nullptr; } + + /** @return the index of an array element */ + ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); } + /** @return raw array index converted to padded index */ + static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; } + private: + /** @return the hash value before any ELEMENTS_PER_LATCH padding */ + static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); } + + /** @return the index of an array element */ + static ulint calc_hash(ulint fold, ulint n_cells) + { + return pad(hash(fold, n_cells)); + } + /** Get a page_hash latch. */ + page_hash_latch *lock_get(ulint fold, ulint n) const + { + static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH), + "must be one less than a power of 2"); + return reinterpret_cast<page_hash_latch*> + (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]); + } + public: + /** Get a page_hash latch. */ + page_hash_latch *lock_get(ulint fold) const + { return lock_get(fold, n_cells); } + + /** Acquire an array latch. + @tparam exclusive whether the latch is to be acquired exclusively + @param fold hash bucket key + @return the acquired latch */ + template<bool exclusive> page_hash_latch *lock(ulint fold) + { + page_hash_latch *latch= lock_get(fold, n_cells); + latch->acquire<exclusive>(); + return latch; + } + + /** Exclusively acquire all latches */ + inline void write_lock_all(); + + /** Release all latches */ + inline void write_unlock_all(); + }; + + /** Hash table of file pages (buf_page_t::in_file() holds), + indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). 
*/ + page_hash_table page_hash; + + /** map of block->frame to buf_block_t blocks that belong + to buf_buddy_alloc(); protected by buf_pool.mutex */ + hash_table_t zip_hash; + /** number of pending read operations */ + Atomic_counter<ulint> n_pend_reads; + Atomic_counter<ulint> + n_pend_unzip; /*!< number of pending decompressions */ + + time_t last_printout_time; + /*!< when buf_print_io was last time + called */ + buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1]; + /*!< Statistics of buddy system, + indexed by block size */ + buf_pool_stat_t stat; /*!< current statistics */ + buf_pool_stat_t old_stat; /*!< old statistics */ + + /* @} */ + + /** @name Page flushing algorithm fields */ + /* @{ */ + + /** mutex protecting flush_list, buf_page_t::set_oldest_modification() + and buf_page_t::list pointers when !oldest_modification() */ + MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex; + /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */ + FlushHp flush_hp; + /** modified blocks (a subset of LRU) */ + UT_LIST_BASE_NODE_T(buf_page_t) flush_list; +private: + /** whether the page cleaner needs wakeup from indefinite sleep */ + bool page_cleaner_is_idle; + /** track server activity count for signaling idle flushing */ + ulint last_activity_count; +public: + /** signalled to wake up the page_cleaner; protected by flush_list_mutex */ + pthread_cond_t do_flush_list; + + /** @return whether the page cleaner must sleep due to being idle */ + bool page_cleaner_idle() const + { + mysql_mutex_assert_owner(&flush_list_mutex); + return page_cleaner_is_idle; + } + /** Wake up the page cleaner if needed */ + inline void page_cleaner_wakeup(); + + /** Register whether an explicit wakeup of the page cleaner is needed */ + void page_cleaner_set_idle(bool deep_sleep) + { + mysql_mutex_assert_owner(&flush_list_mutex); + page_cleaner_is_idle= deep_sleep; + } + + /** Update server last activity count */ + void 
update_last_activity_count(ulint activity_count) + { + mysql_mutex_assert_owner(&flush_list_mutex); + last_activity_count= activity_count; + } + + // n_flush_LRU() + n_flush_list() + // is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list + + unsigned freed_page_clock;/*!< a sequence number used + to count the number of buffer + blocks removed from the end of + the LRU list; NOTE that this + counter may wrap around at 4 + billion! A thread is allowed + to read this for heuristic + purposes without holding any + mutex or latch */ + bool try_LRU_scan; /*!< Cleared when an LRU + scan for free block fails. This + flag is used to avoid repeated + scans of LRU list when we know + that there is no free block + available in the scan depth for + eviction. Set whenever + we flush a batch from the + buffer pool. Protected by the + buf_pool.mutex */ + /* @} */ + + /** @name LRU replacement algorithm fields */ + /* @{ */ + + UT_LIST_BASE_NODE_T(buf_page_t) free; + /*!< base node of the free + block list */ + /** signaled each time when the free list grows; protected by mutex */ + pthread_cond_t done_free; + + UT_LIST_BASE_NODE_T(buf_page_t) withdraw; + /*!< base node of the withdraw + block list. It is only used during + shrinking buffer pool size, not to + reuse the blocks will be removed */ + + ulint withdraw_target;/*!< target length of withdraw + block list, when withdrawing */ + + /** "hazard pointer" used during scan of LRU while doing + LRU list batch. Protected by buf_pool_t::mutex. */ + LRUHp lru_hp; + + /** Iterator used to scan the LRU list when searching for + replacable victim. Protected by buf_pool_t::mutex. 
*/ + LRUItr lru_scan_itr; + + UT_LIST_BASE_NODE_T(buf_page_t) LRU; + /*!< base node of the LRU list */ + + buf_page_t* LRU_old; /*!< pointer to the about + LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV + oldest blocks in the LRU list; + NULL if LRU length less than + BUF_LRU_OLD_MIN_LEN; + NOTE: when LRU_old != NULL, its length + should always equal LRU_old_len */ + ulint LRU_old_len; /*!< length of the LRU list from + the block to which LRU_old points + onward, including that block; + see buf0lru.cc for the restrictions + on this value; 0 if LRU_old == NULL; + NOTE: LRU_old_len must be adjusted + whenever LRU_old shrinks or grows! */ + + UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU; + /*!< base node of the + unzip_LRU list */ + + /* @} */ + /** free ROW_FORMAT=COMPRESSED page frames */ + UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; +#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN +# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" +#endif + + /** Sentinels to detect if pages are read into the buffer pool while + a delete-buffering operation is pending. Protected by mutex. */ + buf_page_t watch[innodb_purge_threads_MAX + 1]; + /** Reserve a buffer. */ + buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); } + + /** @return whether any I/O is pending */ + bool any_io_pending() const + { + return n_pend_reads || n_flush_LRU() || n_flush_list(); + } + /** @return total amount of pending I/O */ + ulint io_pending() const + { + return n_pend_reads + n_flush_LRU() + n_flush_list(); + } + +private: + /** Remove a block from the flush list. */ + inline void delete_from_flush_list_low(buf_page_t *bpage); + /** Remove a block from flush_list. + @param bpage buffer pool page + @param clear whether to invoke buf_page_t::clear_oldest_modification() */ + void delete_from_flush_list(buf_page_t *bpage, bool clear); +public: + /** Remove a block from flush_list. 
+ @param bpage buffer pool page */ + void delete_from_flush_list(buf_page_t *bpage) + { delete_from_flush_list(bpage, true); } + + /** Insert a modified block into the flush list. + @param block modified block + @param lsn start LSN of the mini-transaction that modified the block */ + void insert_into_flush_list(buf_block_t *block, lsn_t lsn); + + /** Free a page whose underlying file page has been freed. */ + inline void release_freed_page(buf_page_t *bpage); + +private: + /** Temporary memory for page_compressed and encrypted I/O */ + struct io_buf_t + { + /** number of elements in slots[] */ + ulint n_slots; + /** array of slots */ + buf_tmp_buffer_t *slots; + + void create(ulint n_slots) + { + this->n_slots= n_slots; + slots= static_cast<buf_tmp_buffer_t*> + (ut_malloc_nokey(n_slots * sizeof *slots)); + memset((void*) slots, 0, n_slots * sizeof *slots); + } + + void close() + { + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + { + aligned_free(s->crypt_buf); + aligned_free(s->comp_buf); + } + ut_free(slots); + slots= nullptr; + n_slots= 0; + } + + /** Reserve a buffer */ + buf_tmp_buffer_t *reserve() + { + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + return nullptr; + } + } io_buf; + + /** whether resize() is in the critical path */ + std::atomic<bool> resizing; +}; + +/** The InnoDB buffer pool */ +extern buf_pool_t buf_pool; + +inline void page_hash_latch::read_lock() +{ + mysql_mutex_assert_not_owner(&buf_pool.mutex); + if (!read_trylock()) + read_lock_wait(); +} + +inline void page_hash_latch::write_lock() +{ + if (!write_trylock()) + write_lock_wait(); +} + +inline void buf_page_t::add_buf_fix_count(uint32_t count) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + buf_fix_count_+= count; +} + +inline void buf_page_t::set_buf_fix_count(uint32_t count) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + buf_fix_count_= count; +} + +inline void buf_page_t::set_state(buf_page_state 
state) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); +#ifdef UNIV_DEBUG + switch (state) { + case BUF_BLOCK_REMOVE_HASH: + /* buf_pool_t::corrupted_evict() invokes set_corrupt_id() + before buf_LRU_free_one_page(), so we cannot assert that + we are holding the hash_lock. */ + break; + case BUF_BLOCK_MEMORY: + if (!in_file()) break; + /* fall through */ + case BUF_BLOCK_FILE_PAGE: + ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked()); + break; + case BUF_BLOCK_NOT_USED: + if (!in_file()) break; + /* fall through */ + case BUF_BLOCK_ZIP_PAGE: + ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() || + (this >= &buf_pool.watch[0] && + this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)])); + break; + } +#endif + state_= state; +} + +inline void buf_page_t::set_io_fix(buf_io_fix io_fix) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + io_fix_= io_fix; +} + +inline void buf_page_t::set_corrupt_id() +{ +#ifdef UNIV_DEBUG + switch (oldest_modification()) { + case 0: + break; + case 2: + ut_ad(fsp_is_system_temporary(id().space())); + /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */ + ut_d(oldest_modification_= 0;) + break; + default: + ut_ad("block is dirty" == 0); + } + switch (state()) { + case BUF_BLOCK_REMOVE_HASH: + break; + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_FILE_PAGE: + ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked()); + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + ut_ad("invalid state" == 0); + } +#endif + id_= page_id_t(~0ULL); +} + +/** Set oldest_modification when adding to buf_pool.flush_list */ +inline void buf_page_t::set_oldest_modification(lsn_t lsn) +{ + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + ut_ad(oldest_modification() <= 1); + ut_ad(lsn > 2); + oldest_modification_= lsn; +} + +/** Clear oldest_modification after removing from buf_pool.flush_list */ +inline void buf_page_t::clear_oldest_modification() +{ + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + ut_d(const auto 
state= state_); + ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE || + state == BUF_BLOCK_REMOVE_HASH); + ut_ad(oldest_modification()); + ut_ad(!list.prev); + ut_ad(!list.next); + /* We must use release memory order to guarantee that callers of + oldest_modification_acquire() will observe the block as + being detached from buf_pool.flush_list, after reading the value 0. */ + oldest_modification_.store(0, std::memory_order_release); +} + +/** Note that a block is no longer dirty, while not removing +it from buf_pool.flush_list */ +inline void buf_page_t::clear_oldest_modification(bool temporary) +{ + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + ut_ad(temporary == fsp_is_system_temporary(id().space())); + ut_ad(io_fix_ == BUF_IO_WRITE); + if (temporary) + { + ut_ad(oldest_modification() == 2); + oldest_modification_= 0; + } + else + { + /* We use release memory order to guarantee that callers of + oldest_modification_acquire() will observe the block as + being detached from buf_pool.flush_list, after reading the value 0. */ + ut_ad(oldest_modification() > 2); + oldest_modification_.store(1, std::memory_order_release); + } +} + +/** @return whether the block is modified and ready for flushing */ +inline bool buf_page_t::ready_for_flush() const +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(in_LRU_list); + ut_a(in_file()); + ut_ad(fsp_is_system_temporary(id().space()) + ? oldest_modification() == 2 + : oldest_modification() > 2); + return io_fix_ == BUF_IO_NONE; +} + +/** @return whether the block can be relocated in memory. +The block can be dirty, but it must not be I/O-fixed or bufferfixed. 
*/ +inline bool buf_page_t::can_relocate() const +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(in_file()); + ut_ad(in_LRU_list); + return io_fix_ == BUF_IO_NONE && !buf_fix_count_; +} + +/** @return whether the block has been flagged old in buf_pool.LRU */ +inline bool buf_page_t::is_old() const +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(in_file()); + ut_ad(in_LRU_list); + return old; +} + +/** Set whether a block is old in buf_pool.LRU */ +inline void buf_page_t::set_old(bool old) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(in_LRU_list); + +#ifdef UNIV_LRU_DEBUG + ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr)); + /* If a block is flagged "old", the LRU_old list must exist. */ + ut_a(!old || buf_pool.LRU_old); + + if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this)) + { + const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this); + const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this); + if (prev->old == next->old) + ut_a(prev->old == old); + else + { + ut_a(!prev->old); + ut_a(buf_pool.LRU_old == (old ? this : next)); + } + } +#endif /* UNIV_LRU_DEBUG */ + + this->old= old; +} + +#ifdef UNIV_DEBUG +/** Forbid the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_forbid() do { \ + mysql_mutex_assert_owner(&buf_pool.mutex); \ + buf_pool.mutex_exit_forbidden++; \ +} while (0) +/** Allow the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_allow() do { \ + mysql_mutex_assert_owner(&buf_pool.mutex); \ + ut_ad(buf_pool.mutex_exit_forbidden--); \ +} while (0) +#else +/** Forbid the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_forbid() ((void) 0) +/** Allow the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_allow() ((void) 0) +#endif + +/********************************************************************** +Let us list the consistency conditions for different control block states. 
+ +NOT_USED: is in free list, not in LRU list, not in flush list, nor + page hash table +MEMORY: is not in free list, LRU list, or flush list, nor page + hash table +FILE_PAGE: space and offset are defined, is in page hash table + if io_fix == BUF_IO_WRITE, + buf_pool.n_flush_LRU() || buf_pool.n_flush_list() + + (1) if buf_fix_count == 0, then + is in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + is x-locked, + if and only if io_fix == BUF_IO_READ + is s-locked, + if and only if io_fix == BUF_IO_WRITE + + (2) if buf_fix_count > 0, then + is not in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + if io_fix == BUF_IO_READ, + is x-locked + if io_fix == BUF_IO_WRITE, + is s-locked + +State transitions: + +NOT_USED => MEMORY +MEMORY => FILE_PAGE +MEMORY => NOT_USED +FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if + (1) buf_fix_count == 0, + (2) oldest_modification == 0, and + (3) io_fix == 0. +*/ + +/** Select from where to start a scan. If we have scanned +too deep into the LRU list it resets the value to the tail +of the LRU list. +@return buf_page_t from where to start scan. */ +inline buf_page_t *LRUItr::start() +{ + mysql_mutex_assert_owner(m_mutex); + + if (!m_hp || m_hp->old) + m_hp= UT_LIST_GET_LAST(buf_pool.LRU); + + return m_hp; +} + +#ifdef UNIV_DEBUG +/** Functor to validate the LRU list. */ +struct CheckInLRUList { + void operator()(const buf_page_t* elem) const + { + ut_a(elem->in_LRU_list); + } + + static void validate() + { + ut_list_validate(buf_pool.LRU, CheckInLRUList()); + } +}; + +/** Functor to validate the LRU list. 
*/ +struct CheckInFreeList { + void operator()(const buf_page_t* elem) const + { + ut_a(elem->in_free_list); + } + + static void validate() + { + ut_list_validate(buf_pool.free, CheckInFreeList()); + } +}; + +struct CheckUnzipLRUAndLRUList { + void operator()(const buf_block_t* elem) const + { + ut_a(elem->page.in_LRU_list); + ut_a(elem->in_unzip_LRU_list); + } + + static void validate() + { + ut_list_validate(buf_pool.unzip_LRU, + CheckUnzipLRUAndLRUList()); + } +}; +#endif /* UNIV_DEBUG */ + +#include "buf0buf.ic" + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic new file mode 100644 index 00000000..4d8cef4c --- /dev/null +++ b/storage/innobase/include/buf0buf.ic @@ -0,0 +1,422 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2014, 2020, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buf.ic +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "mtr0mtr.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "fsp0types.h" + +/*********************************************************************//** +Gets the current size of buffer buf_pool in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void) +/*========================*/ +{ + return(srv_buf_pool_curr_size); +} + +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +unsigned +buf_page_get_freed_page_clock( +/*==========================*/ + const buf_page_t* bpage) /*!< in: block */ +{ + /* This is sometimes read without holding buf_pool.mutex. */ + return(bpage->freed_page_clock); +} + +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +unsigned +buf_block_get_freed_page_clock( +/*===========================*/ + const buf_block_t* block) /*!< in: block */ +{ + return(buf_page_get_freed_page_clock(&block->page)); +} + +/** Determine if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +The page must be either buffer-fixed, or its page hash must be locked. 
+@param[in] bpage buffer pool page +@return whether bpage is close to MRU end of LRU */ +inline bool buf_page_peek_if_young(const buf_page_t *bpage) +{ + /* FIXME: bpage->freed_page_clock is 31 bits */ + return((buf_pool.freed_page_clock & ((1UL << 31) - 1)) + < (bpage->freed_page_clock + + (buf_pool.curr_size + * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio) + / (BUF_LRU_OLD_RATIO_DIV * 4)))); +} + +/** Determine if a block should be moved to the start of the LRU list if +there is danger of dropping from the buffer pool. +@param[in] bpage buffer pool page +@return true if bpage should be made younger */ +inline bool buf_page_peek_if_too_old(const buf_page_t *bpage) +{ + if (buf_pool.freed_page_clock == 0) { + /* If eviction has not started yet, do not update the + statistics or move blocks in the LRU list. This is + either the warm-up phase or an in-memory workload. */ + return(FALSE); + } else if (buf_LRU_old_threshold_ms && bpage->old) { + uint32_t access_time = bpage->is_accessed(); + + /* It is possible that the below comparison returns an + unexpected result. 2^32 milliseconds pass in about 50 days, + so if the difference between ut_time_ms() and access_time + is e.g. 50 days + 15 ms, then the below will behave as if + it is 15 ms. This is known and fixing it would require to + increase buf_page_t::access_time from 32 to 64 bits. */ + if (access_time + && ((ib_uint32_t) (ut_time_ms() - access_time)) + >= buf_LRU_old_threshold_ms) { + return(TRUE); + } + + buf_pool.stat.n_pages_not_made_young++; + return false; + } else { + return !buf_page_peek_if_young(bpage); + } +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets a pointer to the memory frame of a block. 
+@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + if (!block) { + return NULL; + } + + switch (block->page.state()) { + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_NOT_USED: + ut_error; + break; + case BUF_BLOCK_FILE_PAGE: + ut_a(block->page.buf_fix_count()); + /* fall through */ + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + goto ok; + } + ut_error; +ok: + return((buf_frame_t*) block->frame); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. +@return: the allocated descriptor. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ +{ + buf_page_t* bpage; + + bpage = (buf_page_t*) ut_zalloc_nokey(sizeof *bpage); + ut_ad(bpage); + MEM_UNDEFINED(bpage, sizeof *bpage); + + return(bpage); +} + +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ +{ + ut_free(bpage); +} + +/** Allocate a buffer block. +@return own: the allocated block, in state BUF_BLOCK_MEMORY */ +inline buf_block_t *buf_block_alloc() +{ + return buf_LRU_get_free_block(false); +} + +/********************************************************************//** +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block) /*!< in, own: block to be freed */ +{ + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_block_free_non_file_page(block); + mysql_mutex_unlock(&buf_pool.mutex); +} + +/********************************************************************//** +Increments the modify clock of a frame by 1. 
The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +void +buf_block_modify_clock_inc( +/*=======================*/ + buf_block_t* block) /*!< in: block */ +{ +#ifdef SAFE_MUTEX + /* No latch is acquired for the shared temporary tablespace. */ + ut_ad(fsp_is_system_temporary(block->page.id().space()) + || (mysql_mutex_is_owner(&buf_pool.mutex) + && !block->page.buf_fix_count()) + || rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); +#else /* SAFE_MUTEX */ + /* No latch is acquired for the shared temporary tablespace. */ + ut_ad(fsp_is_system_temporary(block->page.id().space()) + || !block->page.buf_fix_count() + || rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); +#endif /* SAFE_MUTEX */ + assert_block_ahi_valid(block); + + block->modify_clock++; +} + +/********************************************************************//** +Returns the value of the modify clock. The caller must have an s-lock +or x-lock on the block. +@return value */ +UNIV_INLINE +ib_uint64_t +buf_block_get_modify_clock( +/*=======================*/ + buf_block_t* block) /*!< in: block */ +{ +#ifdef UNIV_DEBUG + /* No latch is acquired for the shared temporary tablespace. */ + if (!fsp_is_system_temporary(block->page.id().space())) { + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_S) + || rw_lock_own(&(block->lock), RW_LOCK_X) + || rw_lock_own(&(block->lock), RW_LOCK_SX)); + } +#endif /* UNIV_DEBUG */ + + return(block->modify_clock); +} + +/*******************************************************************//** +Increments the bufferfix count. 
*/ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line */ +#endif /* UNIV_DEBUG */ + buf_block_t* block) /*!< in/out: block to bufferfix */ +{ +#ifdef UNIV_DEBUG + /* No debug latch is acquired if block belongs to system temporary. + Debug latch is not of much help if access to block is single + threaded. */ + if (!fsp_is_system_temporary(block->page.id().space())) { + ibool ret; + ret = rw_lock_s_lock_nowait(block->debug_latch, file, line); + ut_a(ret); + } +#endif /* UNIV_DEBUG */ + + block->fix(); +} + +/*******************************************************************//** +Decrements the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_dec( +/*==================*/ + buf_block_t* block) /*!< in/out: block to bufferunfix */ +{ +#ifdef UNIV_DEBUG + /* No debug latch is acquired if block belongs to system temporary. + Debug latch is not of much help if access to block is single + threaded. */ + if (!fsp_is_system_temporary(block->page.id().space())) { + rw_lock_s_unlock(block->debug_latch); + } +#endif /* UNIV_DEBUG */ + + block->unfix(); +} + +/********************************************************************//** +Releases a compressed-only page acquired with buf_page_get_zip(). */ +UNIV_INLINE +void +buf_page_release_zip( +/*=================*/ + buf_page_t* bpage) /*!< in: buffer block */ +{ + ut_ad(bpage); + ut_a(bpage->buf_fix_count()); + + switch (bpage->state()) { + case BUF_BLOCK_FILE_PAGE: +#ifdef UNIV_DEBUG + { + /* No debug latch is acquired if block belongs to system + temporary. Debug latch is not of much help if access to block + is single threaded. 
*/ + buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); + if (!fsp_is_system_temporary(block->page.id().space())) { + rw_lock_s_unlock(block->debug_latch); + } + } +#endif /* UNIV_DEBUG */ + /* Fall through */ + case BUF_BLOCK_ZIP_PAGE: + reinterpret_cast<buf_block_t*>(bpage)->unfix(); + return; + + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + ut_error; +} + +/********************************************************************//** +Releases a latch, if specified. */ +UNIV_INLINE +void +buf_page_release_latch( +/*===================*/ + buf_block_t* block, /*!< in: buffer block */ + ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH, + RW_NO_LATCH */ +{ +#ifdef UNIV_DEBUG + /* No debug latch is acquired if block belongs to system + temporary. Debug latch is not of much help if access to block + is single threaded. */ + if (!fsp_is_system_temporary(block->page.id().space())) { + rw_lock_s_unlock(block->debug_latch); + } +#endif /* UNIV_DEBUG */ + + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&block->lock); + } else if (rw_latch == RW_SX_LATCH) { + rw_lock_sx_unlock(&block->lock); + } else if (rw_latch == RW_X_LATCH) { + rw_lock_x_unlock(&block->lock); + } +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Adds latch level info for the rw-lock protecting the buffer frame. This +should be called in the debug version after a successful latching of a +page if we know the latching order level of the acquired latch. */ +UNIV_INLINE +void +buf_block_dbg_add_level( +/*====================*/ + buf_block_t* block, /*!< in: buffer page + where we have acquired latch */ + latch_level_t level) /*!< in: latching order level */ +{ + sync_check_lock(&block->lock, level); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Get buf frame. 
*/ +UNIV_INLINE +void * +buf_page_get_frame( +/*===============*/ + const buf_page_t* bpage) /*!< in: buffer pool page */ +{ + /* In encryption/compression buffer pool page may contain extra + buffer where result is stored. */ + if (bpage->slot && bpage->slot->out_buf) { + return bpage->slot->out_buf; + } else if (bpage->zip.data) { + return bpage->zip.data; + } else { + return ((buf_block_t*) bpage)->frame; + } +} + +/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, +if needed. +@param[in] size size in bytes +@return aligned size */ +UNIV_INLINE +ulint +buf_pool_size_align( + ulint size) +{ + const ulong m = srv_buf_pool_chunk_unit; + size = ut_max(size, srv_buf_pool_min_size); + + if (size % m == 0) { + return(size); + } else { + return (ulint)((size / m + 1) * m); + } +} diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h new file mode 100644 index 00000000..8dc25f91 --- /dev/null +++ b/storage/innobase/include/buf0checksum.h @@ -0,0 +1,67 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0checksum.h +Buffer pool checksum functions, also linked from /extra/innochecksum.cc + +Created Aug 11, 2011 Vasil Dimov +*******************************************************/ + +#ifndef buf0checksum_h +#define buf0checksum_h + +#include "buf0types.h" + +/** Calculate the CRC32 checksum of a page. The value is stored to the page +when it is written to a file and also checked for a match when reading from +the file. Note that we must be careful to calculate the same value on all +architectures. +@param[in] page buffer page (srv_page_size bytes) +@return CRC-32C */ +uint32_t buf_calc_page_crc32(const byte* page); + +/** Calculate a checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_new_checksum(const byte* page); + +/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that +the checksum only looked at the first few bytes of the page. +This calculates that old checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_old_checksum(const byte* page); + +/** Return a printable string describing the checksum algorithm. 
+@param[in] algo algorithm +@return algorithm name */ +const char* +buf_checksum_algorithm_name(srv_checksum_algorithm_t algo); + +extern ulong srv_checksum_algorithm; + +#endif /* buf0checksum_h */ diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h new file mode 100644 index 00000000..fb9df555 --- /dev/null +++ b/storage/innobase/include/buf0dblwr.h @@ -0,0 +1,170 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0dblwr.h +Doublewrite buffer module + +Created 2011/12/19 Inaam Rana +*******************************************************/ + +#pragma once + +#include "os0file.h" +#include "buf0types.h" + +/** Doublewrite control struct */ +class buf_dblwr_t +{ + struct element + { + /** asynchronous write request */ + IORequest request; + /** payload size in bytes */ + size_t size; + }; + + struct slot + { + /** first free position in write_buf measured in units of + * srv_page_size */ + ulint first_free; + /** number of slots reserved for the current write batch */ + ulint reserved; + /** the doublewrite buffer, aligned to srv_page_size */ + byte* write_buf; + /** buffer blocks to be written via write_buf */ + element* buf_block_arr; + }; + + /** the page number of the first doublewrite block (block_size() pages) */ + page_id_t block1= page_id_t(0, 0); + /** the page number of the second doublewrite block (block_size() pages) */ + page_id_t block2= page_id_t(0, 0); + + /** mutex protecting the data members below */ + mysql_mutex_t mutex; + /** condition variable for !batch_running */ + pthread_cond_t cond; + /** whether a batch is being written from the doublewrite buffer */ + bool batch_running; + /** number of expected flush_buffered_writes_completed() calls */ + unsigned flushing_buffered_writes; + /** pages submitted to flush_buffered_writes() */ + ulint pages_submitted; + /** number of flush_buffered_writes_completed() calls */ + ulint writes_completed; + /** number of pages written by flush_buffered_writes_completed() */ + ulint pages_written; + + slot slots[2]; + slot *active_slot= &slots[0]; + + /** 
Initialize the doublewrite buffer data structure. + @param header doublewrite page header in the TRX_SYS page */ + inline void init(const byte *header); + + /** Flush possible buffered writes to persistent storage. */ + bool flush_buffered_writes(const ulint size); + +public: + /** Create or restore the doublewrite buffer in the TRX_SYS page. + @return whether the operation succeeded */ + bool create(); + /** Free the doublewrite buffer. */ + void close(); + + /** Acquire the mutex */ + void lock() { mysql_mutex_lock(&mutex); } + /** @return the number of submitted page writes */ + ulint submitted() const + { mysql_mutex_assert_owner(&mutex); return pages_submitted; } + /** @return the number of completed batches */ + ulint batches() const + { mysql_mutex_assert_owner(&mutex); return writes_completed; } + /** @return the number of final pages written */ + ulint written() const + { mysql_mutex_assert_owner(&mutex); return pages_written; } + /** Release the mutex */ + void unlock() { mysql_mutex_unlock(&mutex); } + + /** Initialize the doublewrite buffer memory structure on recovery. + If we are upgrading from a version before MySQL 4.1, then this + function performs the necessary update operations to support + innodb_file_per_table. If we are in a crash recovery, this function + loads the pages from double write buffer into memory. + @param file File handle + @param path Path name of file + @return DB_SUCCESS or error code */ + dberr_t init_or_load_pages(pfs_os_file_t file, const char *path); + + /** Process and remove the double write buffer pages for all tablespaces. */ + void recover(); + + /** Update the doublewrite buffer on data page write completion. */ + void write_completed(); + /** Flush possible buffered writes to persistent storage. + It is very important to call this function after a batch of writes has been + posted, and also when we may have to wait for a page latch! + Otherwise a deadlock of threads can occur. 
*/ + void flush_buffered_writes(); + /** Update the doublewrite buffer on write batch completion + @param request the completed batch write request */ + void flush_buffered_writes_completed(const IORequest &request); + + /** Size of the doublewrite block in pages */ + uint32_t block_size() const { return FSP_EXTENT_SIZE; } + + /** Schedule a page write. If the doublewrite memory buffer is full, + flush_buffered_writes() will be invoked to make space. + @param request asynchronous write request + @param size payload size in bytes */ + void add_to_batch(const IORequest &request, size_t size); + + /** Determine whether the doublewrite buffer is initialized */ + bool is_initialised() const + { return UNIV_LIKELY(block1 != page_id_t(0, 0)); } + + /** @return whether a page identifier is part of the doublewrite buffer */ + bool is_inside(const page_id_t id) const + { + if (!is_initialised()) + return false; + ut_ad(block1 < block2); + if (id < block1) + return false; + const uint32_t size= block_size(); + return id < block1 + size || (id >= block2 && id < block2 + size); + } + + /** Wait for flush_buffered_writes() to be fully completed */ + void wait_flush_buffered_writes() + { + if (is_initialised()) + { + mysql_mutex_lock(&mutex); + while (batch_running) + my_cond_wait(&cond, &mutex.m_mutex); + mysql_mutex_unlock(&mutex); + } + } +}; + +/** The doublewrite buffer */ +extern buf_dblwr_t buf_dblwr; diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h new file mode 100644 index 00000000..48586900 --- /dev/null +++ b/storage/innobase/include/buf0dump.h @@ -0,0 +1,44 @@ +/***************************************************************************** + +Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0dump.h +Implements a buffer pool dump/load. + +Created April 08, 2011 Vasil Dimov +*******************************************************/ + +#ifndef buf0dump_h +#define buf0dump_h + +/** Start the buffer pool dump/load task and instructs it to start a dump. */ +void buf_dump_start(); +/** Start the buffer pool dump/load task and instructs it to start a load. */ +void buf_load_start(); + +/** Abort a currently running buffer pool load. */ +void buf_load_abort(); + +/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/ +void buf_load_at_startup(); + +/** Wait for currently running load/dumps to finish*/ +void buf_load_dump_end(); + +#endif /* buf0dump_h */ diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h new file mode 100644 index 00000000..8d45cf2b --- /dev/null +++ b/storage/innobase/include/buf0flu.h @@ -0,0 +1,153 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2021, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0flu.h +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0flu_h +#define buf0flu_h + +#include "ut0byte.h" +#include "log0log.h" +#include "buf0types.h" + +/** Number of pages flushed. Protected by buf_pool.mutex. */ +extern ulint buf_flush_page_count; +/** Number of pages flushed via LRU. Protected by buf_pool.mutex. +Also included in buf_flush_page_count. */ +extern ulint buf_lru_flush_page_count; + +/** Flag indicating if the page_cleaner is in active state. */ +extern bool buf_page_cleaner_is_active; + +#ifdef UNIV_DEBUG + +/** Value of MySQL global variable used to disable page cleaner. */ +extern my_bool innodb_page_cleaner_disabled_debug; + +#endif /* UNIV_DEBUG */ + +/** Remove all dirty pages belonging to a given tablespace when we are +deleting the data file of that tablespace. +The pages still remain a part of LRU and are evicted from +the list as they age towards the tail of the LRU. 
+@param id tablespace identifier */ +void buf_flush_remove_pages(ulint id); + +/*******************************************************************//** +Relocates a buffer control block on the flush_list. +Note that it is assumed that the contents of bpage has already been +copied to dpage. */ +ATTRIBUTE_COLD +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage); /*!< in/out: destination block */ + +/** Complete write of a file page from buf_pool. +@param request write request */ +void buf_page_write_complete(const IORequest &request); + +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page); + +/** Initialize a page for writing to the tablespace. +@param[in] block buffer block; NULL if bypassing the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if uncompressed +@param[in] use_full_checksum whether tablespace uses full checksum */ +void +buf_flush_init_for_writing( + const buf_block_t* block, + byte* page, + void* page_zip_, + bool use_full_checksum); + +/** Write out dirty blocks from buf_pool.flush_list. +@param max_n wished maximum number of blocks flushed +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@return the number of processed pages +@retval 0 if a buf_pool.flush_list batch is already running */ +ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX); + +/** Try to flush dirty pages that belong to a given tablespace. +@param space tablespace +@param n_flushed number of pages written +@return whether the flush for some pages might not have been initiated */ +bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Write out dirty blocks from buf_pool.LRU. 
+@param max_n wished maximum number of blocks flushed +@return the number of processed pages +@retval 0 if a buf_pool.LRU batch is already running */ +ulint buf_flush_LRU(ulint max_n); + +/** Wait until a flush batch ends. +@param lru true=buf_pool.LRU; false=buf_pool.flush_list */ +void buf_flush_wait_batch_end(bool lru); +/** Wait until all persistent pages are flushed up to a limit. +@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ +ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn); +/** Initiate more eager page flushing if the log checkpoint age is too old. +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@param furious true=furious flushing, false=limit to innodb_io_capacity */ +ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious); + +/********************************************************************//** +This function should be called at a mini-transaction commit, if a page was +modified in it. Puts the block to the list of modified blocks, if it is not +already in it. */ +UNIV_INLINE +void +buf_flush_note_modification( +/*========================*/ + buf_block_t* block, /*!< in: block which is modified */ + lsn_t start_lsn, /*!< in: start lsn of the first mtr in a + set of mtr's */ + lsn_t end_lsn); /*!< in: end lsn of the last mtr in the + set of mtr's */ + +/** Initialize page_cleaner. */ +ATTRIBUTE_COLD void buf_flush_page_cleaner_init(); + +/** Wait for pending flushes to complete. */ +void buf_flush_wait_batch_end_acquiring_mutex(bool lru); + +/** Flush the buffer pool on shutdown. */ +ATTRIBUTE_COLD void buf_flush_buffer_pool(); + +#ifdef UNIV_DEBUG +/** Validate the flush list. */ +void buf_flush_validate(); +#endif /* UNIV_DEBUG */ + +/** Synchronously flush dirty blocks. +NOTE: The calling thread is not allowed to hold any buffer page latches! 
*/ +void buf_flush_sync(); + +#include "buf0flu.ic" + +#endif diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic new file mode 100644 index 00000000..b8a9b6d1 --- /dev/null +++ b/storage/innobase/include/buf0flu.ic @@ -0,0 +1,66 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0flu.ic +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "assume_aligned.h" +#include "buf0buf.h" +#include "srv0srv.h" + +/********************************************************************//** +This function should be called at a mini-transaction commit, if a page was +modified in it. Puts the block to the list of modified blocks, if it is not +already in it. 
*/ +UNIV_INLINE +void +buf_flush_note_modification( +/*========================*/ + buf_block_t* block, /*!< in: block which is modified */ + lsn_t start_lsn, /*!< in: start lsn of the mtr that + modified this block */ + lsn_t end_lsn) /*!< in: end lsn of the mtr that + modified this block */ +{ + ut_ad(!srv_read_only_mode); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count()); + ut_ad(mach_read_from_8(block->frame + FIL_PAGE_LSN) <= end_lsn); + mach_write_to_8(block->frame + FIL_PAGE_LSN, end_lsn); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memcpy_aligned<8>(FIL_PAGE_LSN + block->page.zip.data, + FIL_PAGE_LSN + block->frame, 8); + } + + const lsn_t oldest_modification = block->page.oldest_modification(); + + if (oldest_modification > 1) { + ut_ad(oldest_modification <= start_lsn); + } else if (fsp_is_system_temporary(block->page.id().space())) { + block->page.set_temp_modified(); + } else { + buf_pool.insert_into_flush_list(block, start_lsn); + } + + srv_stats.buf_pool_write_requests.inc(); +} diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h new file mode 100644 index 00000000..540c14a4 --- /dev/null +++ b/storage/innobase/include/buf0lru.h @@ -0,0 +1,204 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0lru.h +The database buffer pool LRU replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0lru_h +#define buf0lru_h + +#include "ut0byte.h" +#include "buf0types.h" + +// Forward declaration +struct trx_t; +struct fil_space_t; + +/** Flush this many pages in buf_LRU_get_free_block() */ +extern size_t innodb_lru_flush_size; + +/*####################################################################### +These are low-level functions +#########################################################################*/ + +/** Minimum LRU list length for which the LRU_old pointer is defined */ +#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ + +/** Try to free a block. If bpage is a descriptor of a compressed-only +ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well. +The caller must hold buf_pool.mutex. +@param bpage block to be freed +@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page +@retval true if freed and buf_pool.mutex may have been temporarily released +@retval false if the page was not freed */ +bool buf_LRU_free_page(buf_page_t *bpage, bool zip) + MY_ATTRIBUTE((nonnull)); + +/** Try to free a replaceable block. +@param limit maximum number of blocks to scan +@return true if found and freed */ +bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED); + +/** @return a buffer block from the buf_pool.free list +@retval NULL if the free list is empty */ +buf_block_t* buf_LRU_get_free_only(); + +/** Get a block from the buf_pool.free list. 
+If the list is empty, blocks will be moved from the end of buf_pool.LRU +to buf_pool.free. + +This function is called from a user thread when it needs a clean +block to read in a page. Note that we only ever get a block from +the free list. Even when we flush a page or find a page in LRU scan +we put it to free list to be used. +* iteration 0: + * get a block from the buf_pool.free list, success:done + * if buf_pool.try_LRU_scan is set + * scan LRU up to 100 pages to free a clean block + * success:retry the free list + * flush up to innodb_lru_flush_size LRU blocks to data files + (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth) + * on buf_page_write_complete() the blocks will be put on buf_pool.free list + * success: retry the free list +* subsequent iterations: same as iteration 0 except: + * scan whole LRU list + * scan LRU list even if buf_pool.try_LRU_scan is not set + +@param have_mutex whether buf_pool.mutex is already being held +@return the free control block, in state BUF_BLOCK_MEMORY */ +buf_block_t* buf_LRU_get_free_block(bool have_mutex) + MY_ATTRIBUTE((malloc,warn_unused_result)); + +/** @return whether the unzip_LRU list should be used for evicting a victim +instead of the general LRU list */ +bool buf_LRU_evict_from_unzip_LRU(); + +/** Puts a block back to the free list. +@param[in] block block; not containing a file page */ +void +buf_LRU_block_free_non_file_page(buf_block_t* block); +/******************************************************************//** +Adds a block to the LRU list. 
Please make sure that the page_size is +already set when invoking the function, so that we can get correct +page_size from the buffer page when adding a block into LRU */ +void +buf_LRU_add_block( +/*==============*/ + buf_page_t* bpage, /*!< in: control block */ + bool old); /*!< in: true if should be put to the old + blocks in the LRU list, else put to the + start; if the LRU list is very short, added to + the start regardless of this parameter */ +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old); /*!< in: TRUE if should be put to the end + of the list, else put to the start */ + +/** Update buf_pool.LRU_old_ratio. +@param[in] old_pct Reserve this percentage of + the buffer pool for "old" blocks +@param[in] adjust true=adjust the LRU list; + false=just assign buf_pool.LRU_old_ratio + during the initialization of InnoDB +@return updated old_pct */ +uint buf_LRU_old_ratio_update(uint old_pct, bool adjust); +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +void +buf_LRU_stat_update(); + +/** Remove one page from LRU list and put it to free list. +@param bpage file page to be freed +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) */ +void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id, + page_hash_latch *hash_lock) + MY_ATTRIBUTE((nonnull)); + +#ifdef UNIV_DEBUG +/** Validate the LRU list. */ +void buf_LRU_validate(); +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Dump the LRU list to stderr. 
*/ +void buf_LRU_print(); +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + +/** @name Heuristics for detecting index scan @{ */ +/** The denominator of buf_pool.LRU_old_ratio. */ +#define BUF_LRU_OLD_RATIO_DIV 1024 +/** Maximum value of buf_pool.LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool.LRU_old_ratio_update */ +#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV +/** Minimum value of buf_pool.LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool.LRU_old_ratio_update +The minimum must exceed +(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */ +#define BUF_LRU_OLD_RATIO_MIN 51 + +#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX +# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX" +#endif +#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV +# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV" +#endif + +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +extern uint buf_LRU_old_threshold_ms; +/* @} */ + +/** @brief Statistics for selecting the LRU list for eviction. + +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics we decide +if we want to evict from buf_pool.unzip_LRU or buf_pool.LRU. */ +struct buf_LRU_stat_t +{ + ulint io; /**< Counter of buffer pool I/O operations. */ + ulint unzip; /**< Counter of page_zip_decompress operations. */ +}; + +/** Current operation counters. Not protected by any mutex. +Cleared by buf_LRU_stat_update(). */ +extern buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Protected by buf_pool.mutex. */ +extern buf_LRU_stat_t buf_LRU_stat_sum; + +/********************************************************************//** +Increments the I/O counter in buf_LRU_stat_cur. 
*/ +#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++ +/********************************************************************//** +Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++ + +#endif diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h new file mode 100644 index 00000000..87c6b5d7 --- /dev/null +++ b/storage/innobase/include/buf0rea.h @@ -0,0 +1,119 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0rea.h +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0rea_h +#define buf0rea_h + +#include "buf0buf.h" + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. 
+@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@retval DB_SUCCESS if the page was read and is not corrupted, +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. +@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in,out] space tablespace +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] sync true if synchronous aio is desired */ +void buf_read_page_background(fil_space_t *space, const page_id_t page_id, + ulint zip_size, bool sync) + MY_ATTRIBUTE((nonnull)); + +/** Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. 
+@param[in] page_id page id of a page which the current thread +wants to access +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! */ +ulint +buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf); + +/** Applies linear read-ahead if in the buf_pool the page is a border page of +a linear read-ahead area and all the pages in the area have been accessed. +Does not read any page if the read-ahead mechanism is not activated. Note +that the algorithm looks at the 'natural' adjacent successor and +predecessor of the page, which on the leaf level of a B-tree are the next +and previous page in the chain of leaves. To know these, the page specified +in (space, offset) must already be present in the buf_pool. Thus, the +natural way to use this function is to call it when a page in the buf_pool +is accessed the first time, calling this function just after it has been +bufferfixed. +NOTE 1: as this function looks at the natural predecessor and successor +fields on the page, what happens, if these are not initialized to any +sensible value? No problem, before applying read-ahead we check that the +area to read is within the span of the space, if not, read-ahead is not +applied. An uninitialized value may result in a useless read operation, but +only very improbably. +NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this +function must be written such that it cannot end up waiting for these +latches! +NOTE 3: the calling thread must want access to the page given: this rule is +set to prevent unintended read-aheads performed by ibuf routines, a situation +which could result in a deadlock if the OS does not support asynchronous io. 
+@param[in] page_id page id; see NOTE 3 above +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine +@return number of page read requests issued */ +ulint +buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf); + +/** Issues read requests for pages which recovery wants to read in. +@param[in] space_id tablespace id +@param[in] page_nos array of page numbers to read, with the +highest page number the last in the array +@param[in] n number of page numbers in the array */ +void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n); + +/** @name Modes used in read-ahead @{ */ +/** read only pages belonging to the insert buffer tree */ +#define BUF_READ_IBUF_PAGES_ONLY 131 +/** read any page */ +#define BUF_READ_ANY_PAGE 132 +/* @} */ + +#endif diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h new file mode 100644 index 00000000..5dd58109 --- /dev/null +++ b/storage/innobase/include/buf0types.h @@ -0,0 +1,225 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0types.h +The database buffer pool global types for the directory + +Created 11/17/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0types_h +#define buf0types_h + +#include "univ.i" + +/** Buffer page (uncompressed or compressed) */ +class buf_page_t; +/** Buffer block for which an uncompressed page exists */ +struct buf_block_t; +/** Buffer pool statistics struct */ +struct buf_pool_stat_t; +/** Buffer pool buddy statistics struct */ +struct buf_buddy_stat_t; + +/** A buffer frame. @see page_t */ +typedef byte buf_frame_t; + +/** Flags for io_fix types */ +enum buf_io_fix { + BUF_IO_NONE = 0, /**< no pending I/O */ + BUF_IO_READ, /**< read pending */ + BUF_IO_WRITE, /**< write pending */ + BUF_IO_PIN /**< disallow relocation of + block and its removal from + the flush_list */ +}; + +/** Alternatives for srv_checksum_algorithm, which can be changed by +setting innodb_checksum_algorithm */ +enum srv_checksum_algorithm_t { + SRV_CHECKSUM_ALGORITHM_CRC32, /*!< Write crc32, allow crc32, + innodb or none when reading */ + SRV_CHECKSUM_ALGORITHM_STRICT_CRC32, /*!< Write crc32, allow crc32 + when reading */ + SRV_CHECKSUM_ALGORITHM_INNODB, /*!< Write innodb, allow crc32, + innodb or none when reading */ + SRV_CHECKSUM_ALGORITHM_STRICT_INNODB, /*!< Write innodb, allow + innodb when reading */ + SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32, + innodb or none when reading */ + SRV_CHECKSUM_ALGORITHM_STRICT_NONE, /*!< Write none, allow none + when reading */ + + /** For new files, always compute CRC-32C for the whole page. 
+ For old files, allow crc32, innodb or none when reading. */ + SRV_CHECKSUM_ALGORITHM_FULL_CRC32, + + /** For new files, always compute CRC-32C for the whole page. + For old files, allow crc32 when reading. */ + SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32 +}; + +inline +bool +is_checksum_strict(srv_checksum_algorithm_t algo) +{ + return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 + || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB + || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE); +} + +inline +bool +is_checksum_strict(ulint algo) +{ + return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 + || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB + || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE); +} + +/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ +/* @{ */ +/** Zip shift value for the smallest page size */ +#define BUF_BUDDY_LOW_SHIFT UNIV_ZIP_SIZE_SHIFT_MIN + +/** Smallest buddy page size */ +#define BUF_BUDDY_LOW (1U << BUF_BUDDY_LOW_SHIFT) + +/** Actual number of buddy sizes based on current page size */ +#define BUF_BUDDY_SIZES (srv_page_size_shift - BUF_BUDDY_LOW_SHIFT) + +/** Maximum number of buddy sizes based on the max page size */ +#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX \ + - BUF_BUDDY_LOW_SHIFT) + +/** twice the maximum block size of the buddy system; +the underlying memory is aligned by this amount: +this must be equal to srv_page_size */ +#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES) +/* @} */ + +/** Page identifier. */ +class page_id_t +{ +public: + /** Constructor from (space, page_no). 
+ @param[in] space tablespace id + @param[in] page_no page number */ + page_id_t(ulint space, uint32_t page_no) : m_id(uint64_t{space} << 32 | page_no) + { + ut_ad(space <= 0xFFFFFFFFU); + } + + /** Constructor from a raw 64-bit identifier. + @param[in] id tablespace id in the high 32 bits, + page number in the low 32 bits */ + page_id_t(uint64_t id) : m_id(id) {} + bool operator==(const page_id_t& rhs) const { return m_id == rhs.m_id; } + bool operator!=(const page_id_t& rhs) const { return m_id != rhs.m_id; } + bool operator<(const page_id_t& rhs) const { return m_id < rhs.m_id; } + bool operator>(const page_id_t& rhs) const { return m_id > rhs.m_id; } + bool operator<=(const page_id_t& rhs) const { return m_id <= rhs.m_id; } + bool operator>=(const page_id_t& rhs) const { return m_id >= rhs.m_id; } + /** Step to the preceding page; the page number is asserted to be nonzero */ + page_id_t &operator--() { ut_ad(page_no()); m_id--; return *this; } + /** Step to the following page; the page number is asserted + to be less than 0xFFFFFFFF */ + page_id_t &operator++() + { + ut_ad(page_no() < 0xFFFFFFFFU); + m_id++; + return *this; + } + /** @return the identifier that is i pages before this one + (asserted not to underflow the page number) */ + page_id_t operator-(uint32_t i) const + { + ut_ad(page_no() >= i); + return page_id_t(m_id - i); + } + /** @return the identifier that is i pages after this one + (asserted not to overflow the page number) */ + page_id_t operator+(uint32_t i) const + { + ut_ad(page_no() < ~i); + return page_id_t(m_id + i); + } + + /** Retrieve the tablespace id. + @return tablespace id */ + uint32_t space() const { return static_cast<uint32_t>(m_id >> 32); } + + /** Retrieve the page number. + @return page number */ + uint32_t page_no() const { return static_cast<uint32_t>(m_id); } + + /** Retrieve the fold value. + The tablespace id is widened to ulint before it is shifted, so that + where ulint is wider than 32 bits the high-order bits of the + shifted id are kept instead of being discarded by 32-bit arithmetic. + @return fold value */ + ulint fold() const { return (ulint{space()} << 20) + space() + page_no(); } + + /** Reset the page number only. + @param[in] page_no page number */ + void set_page_no(uint32_t page_no) + { + m_id= (m_id & ~uint64_t{0} << 32) | page_no; + } + + /** @return the raw 64-bit identifier (tablespace id in the + high 32 bits, page number in the low 32 bits) */ + ulonglong raw() const { return m_id; } +private: + /** The page identifier */ + uint64_t m_id; +}; + +/** A 64KiB buffer of NUL bytes, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to NUL bytes, in +dtuple_convert_big_rec(). 
*/ +extern const byte *field_ref_zero; + +#ifndef UNIV_INNOCHECKSUM + +#include "ut0mutex.h" +#include "sync0rw.h" +#include "rw_lock.h" + +class page_hash_latch : public rw_lock +{ +public: + /** Wait for a shared lock */ + void read_lock_wait(); + /** Wait for an exclusive lock */ + void write_lock_wait(); + + /** Acquire a shared lock */ + inline void read_lock(); + /** Acquire an exclusive lock */ + inline void write_lock(); + + /** Acquire a lock */ + template<bool exclusive> void acquire() + { + if (exclusive) + write_lock(); + else + read_lock(); + } + /** Release a lock */ + template<bool exclusive> void release() + { + if (exclusive) + write_unlock(); + else + read_unlock(); + } +}; + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif /* buf0types.h */ diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h new file mode 100644 index 00000000..fc774b6e --- /dev/null +++ b/storage/innobase/include/data0data.h @@ -0,0 +1,710 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, 2020 MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0data.h +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef data0data_h +#define data0data_h + +#include "data0types.h" +#include "data0type.h" +#include "mem0mem.h" +#include "dict0types.h" +#include "btr0types.h" +#include <vector> + +#include <ostream> + +/** Storage for overflow data in a big record, that is, a clustered +index record which needs external storage of data fields */ +struct big_rec_t; +struct upd_t; + +/** Dummy variable to catch access to uninitialized fields. In the +debug version, dtuple_create() will make all fields of dtuple_t point +to data_error. */ +ut_d(extern byte data_error); + +/*********************************************************************//** +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /*!< in: SQL data field */ + const dtype_t* type); /*!< in: pointer to data type struct */ + +/*********************************************************************//** +Sets length in a field. 
*/ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /*!< in: field */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + MY_ATTRIBUTE((nonnull)); + +/** Gets spatial status for "external storage" +@param[in] field field; asserted to be externally stored +@return spatial status */ +UNIV_INLINE +spatial_status_t +dfield_get_spatial_status( + const dfield_t* field); + +/** Sets spatial status for "external storage" +@param[in,out] field field +@param[in] spatial_status spatial status */ +UNIV_INLINE +void +dfield_set_spatial_status( + dfield_t* field, + spatial_status_t spatial_status); + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /*!< in: field */ + const void* data, /*!< in: data */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + MY_ATTRIBUTE((nonnull(1))); +/*********************************************************************//** +Writes an MBR, given as SPDIMS * 2 doubles, into the buffer that +field->data already points to, clears the "external storage" flag and +sets the field length to DATA_MBR_LEN. */ +UNIV_INLINE +void +dfield_write_mbr( +/*=============*/ + dfield_t* field, /*!< in/out: field whose data buffer is written */ + const double* mbr) /*!< in: array of SPDIMS * 2 doubles */ + MY_ATTRIBUTE((nonnull(1))); +/*********************************************************************//** +Sets a data field to SQL NULL. */ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field) /*!< in/out: field */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Writes an SQL null field full of zeros. */ +UNIV_INLINE +void +data_write_sql_null( +/*================*/ + byte* data, /*!< in: pointer to a buffer of size len */ + ulint len) /*!< in: SQL null size in bytes */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************************//** +Copies the data and len fields. 
*/ +UNIV_INLINE +void +dfield_copy_data( +/*=============*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2); /*!< in: field to copy from */ + +/*********************************************************************//** +Copies a data field to another. */ +UNIV_INLINE +void +dfield_copy( +/*========*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************************//** +Copies the data pointed to by a data field. */ +UNIV_INLINE +void +dfield_dup( +/*=======*/ + dfield_t* field, /*!< in/out: data field */ + mem_heap_t* heap) /*!< in: memory heap where allocated */ + MY_ATTRIBUTE((nonnull)); + +/*********************************************************************//** +Tests if two data fields are equal. +If len==0, tests the data length and content for equality. +If len>0, tests the first len bytes of the content for equality. +@return TRUE if both fields are NULL or if they are equal */ +UNIV_INLINE +ibool +dfield_datas_are_binary_equal( +/*==========================*/ + const dfield_t* field1, /*!< in: field */ + const dfield_t* field2, /*!< in: field */ + ulint len) /*!< in: maximum prefix to compare, + or 0 to compare the whole field length */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Tests if dfield data length and content is equal to the given. +@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/*********************************************************************//** +Gets info bits in a data tuple. 
+@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************************//** +Gets number of fields used in record comparisons. +@return number of fields used in comparisons in rem0cmp.* */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + const dtuple_t* tuple) /*!< in: tuple */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets the number of fields used in record comparisons. */ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields_cmp) /*!< in: number of fields used in + comparisons in rem0cmp.* */ + MY_ATTRIBUTE((nonnull)); + +/* Estimate the number of bytes that are going to be allocated when +creating a new dtuple_t object */ +#define DTUPLE_EST_ALLOC(n_fields) \ + (sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t)) + +/** Creates a data tuple from an already allocated chunk of memory. +The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields). +The default value for number of fields used in record comparisons +for this tuple is n_fields. 
+@param[in,out] buf buffer to use +@param[in] buf_size buffer size +@param[in] n_fields number of field +@param[in] n_v_fields number of fields on virtual columns +@return created tuple (inside buf) */ +UNIV_INLINE +dtuple_t* +dtuple_create_from_mem( + void* buf, + ulint buf_size, + ulint n_fields, + ulint n_v_fields) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************//** +Creates a data tuple to a memory heap. The default value for number +of fields used in record comparisons for this tuple is n_fields. +@return own: created tuple */ +UNIV_INLINE +dtuple_t* +dtuple_create( +/*==========*/ + mem_heap_t* heap, /*!< in: memory heap where the tuple + is created, DTUPLE_EST_ALLOC(n_fields) + bytes will be allocated from this heap */ + ulint n_fields)/*!< in: number of fields */ + MY_ATTRIBUTE((nonnull, malloc)); + +/** Initialize the virtual field data in a dtuple_t +@param[in,out] vrow dtuple contains the virtual fields */ +UNIV_INLINE void dtuple_init_v_fld(dtuple_t* vrow); + +/** Duplicate the virtual field data in a dtuple_t +@param[in,out] vrow dtuple contains the virtual fields +@param[in] heap heap memory to use */ +UNIV_INLINE void dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap); + +/** Creates a data tuple with possible virtual columns to a memory heap. +@param[in] heap memory heap where the tuple is created +@param[in] n_fields number of fields +@param[in] n_v_fields number of fields on virtual col +@return own: created tuple */ +UNIV_INLINE +dtuple_t* +dtuple_create_with_vcol( + mem_heap_t* heap, + ulint n_fields, + ulint n_v_fields); + +/*********************************************************************//** +Sets number of fields used in a tuple. Normally this is set in +dtuple_create, but if you want later to set it smaller, you can use this. 
*/ +void +dtuple_set_n_fields( +/*================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields) /*!< in: number of fields */ + MY_ATTRIBUTE((nonnull)); +/** Copies a data tuple's virtual fields to another. This is a shallow copy. +@param[in,out] d_tuple destination tuple +@param[in] s_tuple source tuple */ +UNIV_INLINE +void +dtuple_copy_v_fields( + dtuple_t* d_tuple, + const dtuple_t* s_tuple); +/*********************************************************************//** +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. +@return own: copy of tuple */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + const dtuple_t* tuple, /*!< in: tuple to copy from */ + mem_heap_t* heap) /*!< in: memory heap + where the tuple is created */ + MY_ATTRIBUTE((nonnull, malloc)); +/**********************************************************//** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. +@return sum of data lens */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + const dtuple_t* tuple, /*!< in: typed data tuple */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************************//** +Computes the number of externally stored fields in a data tuple. +@return number of fields */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + const dtuple_t* tuple) /*!< in: tuple */ + MY_ATTRIBUTE((nonnull)); +/** Compare two data tuples. +@param[in] tuple1 first data tuple +@param[in] tuple2 second data tuple +@return positive, 0, negative if tuple1 is greater, equal, less, than tuple2, +respectively */ +int +dtuple_coll_cmp( + const dtuple_t* tuple1, + const dtuple_t* tuple2) + MY_ATTRIBUTE((warn_unused_result)); +/** Fold a prefix given as the number of fields of a tuple. 
+@param[in] tuple index record +@param[in] n_fields number of complete fields to fold +@param[in] n_bytes number of bytes to fold in the last field +@param[in] tree_id index tree ID +@return the folded value */ +UNIV_INLINE +ulint +dtuple_fold( + const dtuple_t* tuple, + ulint n_fields, + ulint n_bytes, + index_id_t tree_id) + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************************//** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /*!< in: data tuple */ + ulint n) /*!< in: number of fields to set */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Checks if a dtuple contains an SQL null value. +@return TRUE if some field is SQL null */ +UNIV_INLINE +ibool +dtuple_contains_null( +/*=================*/ + const dtuple_t* tuple) /*!< in: dtuple */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/**********************************************************//** +Checks that a data field is typed. Asserts an error if not. +@return TRUE if ok */ +ibool +dfield_check_typed( +/*===============*/ + const dfield_t* field) /*!< in: data field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/**********************************************************//** +Checks that a data tuple is typed. Asserts an error if not. +@return TRUE if ok */ +ibool +dtuple_check_typed( +/*===============*/ + const dtuple_t* tuple) /*!< in: tuple */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/**********************************************************//** +Validates the consistency of a tuple which must be complete, i.e, +all fields must have been set. 
+@return TRUE if ok */ +ibool +dtuple_validate( +/*============*/ + const dtuple_t* tuple) /*!< in: tuple */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Pretty prints a dfield value according to its data type. */ +void +dfield_print( +/*=========*/ + const dfield_t* dfield) /*!< in: dfield */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Pretty prints a dfield value according to its data type. Also the hex string +is printed if a string contains non-printable characters. */ +void +dfield_print_also_hex( +/*==================*/ + const dfield_t* dfield) /*!< in: dfield */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************//** +The following function prints the contents of a tuple. */ +void +dtuple_print( +/*=========*/ + FILE* f, /*!< in: output stream */ + const dtuple_t* tuple) /*!< in: tuple */ + MY_ATTRIBUTE((nonnull)); + +/** Print the contents of a tuple. +@param[out] o output stream +@param[in] field array of data fields +@param[in] n number of data fields */ +void +dfield_print( + std::ostream& o, + const dfield_t* field, + ulint n); +/** Print the contents of a tuple. +@param[out] o output stream +@param[in] tuple data tuple */ +void +dtuple_print( + std::ostream& o, + const dtuple_t* tuple); + +/** Print the contents of a tuple. +@param[out] o output stream +@param[in] tuple data tuple */ +inline +std::ostream& +operator<<(std::ostream& o, const dtuple_t& tuple) +{ + dtuple_print(o, &tuple); + return(o); +} + +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. 
+@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + upd_t* upd, /*!< in/out: update vector */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ + MY_ATTRIBUTE((malloc, warn_unused_result)); +/**************************************************************//** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. */ +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: entry whose data was put to vector */ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + MY_ATTRIBUTE((nonnull)); +/**************************************************************//** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + MY_ATTRIBUTE((nonnull)); + +/*######################################################################*/ + +/** Structure for an SQL data field */ +struct dfield_t{ + void* data; /*!< pointer to data */ + unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */ + unsigned spatial_status:2; + /*!< spatial status of externally stored field + in undo log for purge */ + unsigned len; /*!< data length; UNIV_SQL_NULL if SQL null */ + dtype_t type; /*!< type of data */ + + /** Create a deep copy of this object. 
+ @param[in,out] heap memory heap in which the clone will be created + @return the cloned object */ + dfield_t* clone(mem_heap_t* heap) const; + + /** @return system field indicates history row */ + bool vers_history_row() const + { + ut_ad(type.vers_sys_end()); + if (type.mtype == DATA_FIXBINARY) { + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); + } else { + ut_ad(type.mtype == DATA_INT); + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(0); + return false; + } +}; + +/** Structure for an SQL data tuple of fields (logical record) */ +struct dtuple_t { + ulint info_bits; /*!< info bits of an index record: + the default is 0; this field is used + if an index record is built from + a data tuple */ + ulint n_fields; /*!< number of fields in dtuple */ + ulint n_fields_cmp; /*!< number of fields which should + be used in comparison services + of rem0cmp.*; the index search + is performed by comparing only these + fields, others are ignored; the + default value in dtuple creation is + the same value as n_fields */ + dfield_t* fields; /*!< fields */ + ulint n_v_fields; /*!< number of virtual fields */ + dfield_t* v_fields; /*!< fields on virtual column */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number, used in + debug assertions */ +/** Value of dtuple_t::magic_n */ +# define DATA_TUPLE_MAGIC_N 65478679 +#endif /* UNIV_DEBUG */ + + /** Trim the tail of an index tuple before insert or update. + After instant ADD COLUMN, if the last fields of a clustered index tuple + match the default values that were explicitly specified or implied + during ADD COLUMN, there will be no need to store them. + NOTE: A page latch in the index must be held, so that the index + may not lose 'instantness' before the trimmed tuple has been + inserted or updated. 
+ @param[in] index index possibly with instantly added columns */ + void trim(const dict_index_t& index); + + /** @return the result of dfield_t::vers_history_row() on the first + field whose type satisfies vers_sys_end(), or false if the tuple + has no such field */ + bool vers_history_row() const + { + for (ulint i = 0; i < n_fields; i++) { + const dfield_t* field = &fields[i]; + if (field->type.vers_sys_end()) { + return field->vers_history_row(); + } + } + return false; + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + static bool is_alter_metadata(ulint info_bits) + { + return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER); + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE + (REC_INFO_DELETED_FLAG is disregarded in the comparison) */ + static bool is_metadata(ulint info_bits) + { + return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_METADATA_ADD); + } + + /** @return whether this is a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const { return is_alter_metadata(info_bits); } + + /** @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + bool is_metadata() const { return is_metadata(info_bits); } +}; + +/** @return number of fields in the tuple */ +inline ulint dtuple_get_n_fields(const dtuple_t* tuple) +{ return tuple->n_fields; } +/** @return pointer to the type of the field */ +inline dtype_t* dfield_get_type(dfield_t* field) { return &field->type; } +/** @return pointer to the type of the field */ +inline const dtype_t* dfield_get_type(const dfield_t* field) +{ return &field->type; } +/** @return pointer to the data of the field */ +inline void* dfield_get_data(dfield_t* field) +{ + ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error); + return field->data; +} +/** @return pointer to the data of the field */ +inline const void* dfield_get_data(const dfield_t* field) +{ + ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error); + return field->data; +} +/** @return length of the field data; may be UNIV_SQL_NULL, +asserted not to be UNIV_SQL_DEFAULT */ +inline ulint dfield_get_len(const dfield_t* field) { + ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error); + ut_ad(field->len != UNIV_SQL_DEFAULT); + return field->len; +} +inline bool 
dfield_is_null(const dfield_t* field) +{ return field->len == UNIV_SQL_NULL; } +/** @return whether a column is to be stored off-page */ +inline bool dfield_is_ext(const dfield_t* field) +{ + ut_ad(!field->ext || field->len >= BTR_EXTERN_FIELD_REF_SIZE); + return static_cast<bool>(field->ext); +} +/** Set the "external storage" flag */ +inline void dfield_set_ext(dfield_t* field) { field->ext = 1; } + +/** Gets number of virtual fields in a data tuple. +@param[in] tuple dtuple to check +@return number of fields */ +inline ulint +dtuple_get_n_v_fields(const dtuple_t* tuple) { return tuple->n_v_fields; } + +/** Get a field of a data tuple. +@param[in] tuple tuple +@param[in] n index of the field; asserted to be less than tuple->n_fields +@return nth field */ +inline const dfield_t* dtuple_get_nth_field(const dtuple_t* tuple, ulint n) +{ + ut_ad(n < tuple->n_fields); + return &tuple->fields[n]; +} +/** Get a field of a data tuple. +@param[in] tuple tuple +@param[in] n index of the field; asserted to be less than tuple->n_fields +@return nth field */ +inline dfield_t* dtuple_get_nth_field(dtuple_t* tuple, ulint n) +{ + ut_ad(n < tuple->n_fields); + return &tuple->fields[n]; +} + +/** Get a virtual column in a table row or an extended clustered index record. +@param[in] tuple tuple +@param[in] n the nth virtual field to get +@return nth virtual field */ +inline const dfield_t* dtuple_get_nth_v_field(const dtuple_t* tuple, ulint n) +{ + ut_ad(n < tuple->n_v_fields); + return &tuple->v_fields[n]; +} +/** Get a virtual column in a table row or an extended clustered index record. +@param[in] tuple tuple +@param[in] n the nth virtual field to get +@return nth virtual field */ +inline dfield_t* dtuple_get_nth_v_field(dtuple_t* tuple, ulint n) +{ + ut_ad(n < tuple->n_v_fields); + return &tuple->v_fields[n]; +} + +/** A slot for a field in a big rec vector */ +struct big_rec_field_t { + + /** Constructor. 
+ @param[in] field_no_ the field number + @param[in] len_ the data length + @param[in] data_ the data */ + big_rec_field_t(ulint field_no_, ulint len_, const void* data_) + : field_no(field_no_), + len(len_), + data(data_) + {} + + ulint field_no; /*!< field number in record */ + ulint len; /*!< stored data length, in bytes */ + const void* data; /*!< stored data */ +}; + +/** Storage format for overflow data in a big record, that is, a +clustered index record which needs external storage of data fields */ +struct big_rec_t { + mem_heap_t* heap; /*!< memory heap from which + allocated */ + const ulint capacity; /*!< fields array size */ + ulint n_fields; /*!< number of stored fields */ + big_rec_field_t*fields; /*!< stored fields */ + + /** Constructor. + @param[in] max the capacity of the array of fields. */ + explicit big_rec_t(const ulint max) + : heap(0), + capacity(max), + n_fields(0), + fields(0) + {} + + /** Append one big_rec_field_t object to the end of array of fields */ + void append(const big_rec_field_t& field) + { + ut_ad(n_fields < capacity); + fields[n_fields] = field; + n_fields++; + } + + /** Allocate a big_rec_t object in the given memory heap, and for + storing n_fld number of fields. + @param[in] heap memory heap in which this object is allocated + @param[in] n_fld maximum number of fields that can be stored in + this object + @return the allocated object */ + static big_rec_t* alloc( + mem_heap_t* heap, + ulint n_fld); +}; + +#include "data0data.ic" + +#endif diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic new file mode 100644 index 00000000..2d1bf5a2 --- /dev/null +++ b/storage/innobase/include/data0data.ic @@ -0,0 +1,633 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0data.ic +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0rnd.h" + +/*********************************************************************//** +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /*!< in: SQL data field */ + const dtype_t* type) /*!< in: pointer to data type struct */ +{ + ut_ad(field != NULL); + ut_ad(type != NULL); + + field->type = *type; +} + +/*********************************************************************//** +Sets length in a field. 
*/ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /*!< in: field */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ +{ + ut_ad(len != UNIV_SQL_DEFAULT); + field->ext = 0; + field->len = static_cast<unsigned int>(len); +} + +/** Gets spatial status for "external storage" +@param[in] field field; asserted to be externally stored +@return spatial status */ +UNIV_INLINE +spatial_status_t +dfield_get_spatial_status( + const dfield_t* field) +{ + ut_ad(dfield_is_ext(field)); + + return(static_cast<spatial_status_t>(field->spatial_status)); +} + +/** Sets spatial status for "external storage" +@param[in,out] field field +@param[in] spatial_status spatial status */ +UNIV_INLINE +void +dfield_set_spatial_status( + dfield_t* field, + spatial_status_t spatial_status) +{ + field->spatial_status = spatial_status & 3; + ut_ad(dfield_get_spatial_status(field) == spatial_status); +} + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /*!< in: field */ + const void* data, /*!< in: data */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ +{ + field->data = (void*) data; + field->ext = 0; + field->len = static_cast<unsigned int>(len); +} + +/*********************************************************************//** +Writes an MBR, given as SPDIMS * 2 doubles, into the buffer that +field->data already points to, clears the "external storage" flag and +sets the field length to DATA_MBR_LEN. */ +UNIV_INLINE +void +dfield_write_mbr( +/*=============*/ + dfield_t* field, /*!< in/out: field whose data buffer is written */ + const double* mbr) /*!< in: array of SPDIMS * 2 doubles */ +{ + /* The loop below reads all SPDIMS * 2 elements, so check them all, + not only the first one. */ + MEM_CHECK_DEFINED(mbr, SPDIMS * 2 * sizeof *mbr); + field->ext = 0; + + for (unsigned i = 0; i < SPDIMS * 2; i++) { + mach_double_write(static_cast<byte*>(field->data) + + i * sizeof(double), mbr[i]); + } + + field->len = DATA_MBR_LEN; +} + +/*********************************************************************//** +Sets a data field to SQL NULL. 
*/ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field) /*!< in/out: field */ +{ + dfield_set_data(field, NULL, UNIV_SQL_NULL); +} + +/*********************************************************************//** +Copies the data and len fields. */ +UNIV_INLINE +void +dfield_copy_data( +/*=============*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ +{ + ut_ad(field1 != NULL); + ut_ad(field2 != NULL); + + field1->data = field2->data; + field1->len = field2->len; + field1->ext = field2->ext; + field1->spatial_status = field2->spatial_status; +} + +/*********************************************************************//** +Copies a data field to another. */ +UNIV_INLINE +void +dfield_copy( +/*========*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ +{ + *field1 = *field2; +} + +/*********************************************************************//** +Copies the data pointed to by a data field. */ +UNIV_INLINE +void +dfield_dup( +/*=======*/ + dfield_t* field, /*!< in/out: data field */ + mem_heap_t* heap) /*!< in: memory heap where allocated */ +{ + if (!dfield_is_null(field)) { + MEM_CHECK_DEFINED(field->data, field->len); + field->data = mem_heap_dup(heap, field->data, field->len); + } +} + +/*********************************************************************//** +Tests if two data fields are equal. +If len==0, tests the data length and content for equality. +If len>0, tests the first len bytes of the content for equality. 
+@return TRUE if both fields are NULL or if they are equal */ +UNIV_INLINE +ibool +dfield_datas_are_binary_equal( +/*==========================*/ + const dfield_t* field1, /*!< in: field */ + const dfield_t* field2, /*!< in: field */ + ulint len) /*!< in: maximum prefix to compare, + or 0 to compare the whole field length */ +{ + ulint len2 = len; + + if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) { + len = field1->len; + } + + if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) { + len2 = field2->len; + } + + return(len == len2 + && (len == UNIV_SQL_NULL + || !memcmp(field1->data, field2->data, len))); +} + +/*********************************************************************//** +Tests if dfield data length and content is equal to the given. +@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ +{ + ut_ad(len != UNIV_SQL_DEFAULT); + return(len == dfield_get_len(field) + && (!len || len == UNIV_SQL_NULL + || !memcmp(dfield_get_data(field), data, len))); +} + +/*********************************************************************//** +Gets info bits in a data tuple. +@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + return(tuple->info_bits); +} + +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ +{ + tuple->info_bits = info_bits; +} + +/*********************************************************************//** +Gets number of fields used in record comparisons. 
+@return number of fields used in comparisons in rem0cmp.* */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + return(tuple->n_fields_cmp); +} + +/*********************************************************************//** +Sets number of fields used in record comparisons. */ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields_cmp) /*!< in: number of fields used in + comparisons in rem0cmp.* */ +{ + ut_ad(n_fields_cmp <= tuple->n_fields); + tuple->n_fields_cmp = n_fields_cmp; +} + +/** Creates a data tuple from an already allocated chunk of memory. +The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields). +The default value for number of fields used in record comparisons +for this tuple is n_fields. +@param[in,out] buf buffer to use +@param[in] buf_size buffer size +@param[in] n_fields number of field +@param[in] n_v_fields number of fields on virtual columns +@return created tuple (inside buf) */ +UNIV_INLINE +dtuple_t* +dtuple_create_from_mem( + void* buf, + ulint buf_size, + ulint n_fields, + ulint n_v_fields) +{ + dtuple_t* tuple; + ulint n_t_fields = n_fields + n_v_fields; + + ut_a(buf_size >= DTUPLE_EST_ALLOC(n_t_fields)); + + tuple = (dtuple_t*) buf; + tuple->info_bits = 0; + tuple->n_fields = n_fields; + tuple->n_v_fields = n_v_fields; + tuple->n_fields_cmp = n_fields; + tuple->fields = (dfield_t*) &tuple[1]; + if (n_v_fields > 0) { + tuple->v_fields = &tuple->fields[n_fields]; + } else { + tuple->v_fields = NULL; + } + +#ifdef UNIV_DEBUG + tuple->magic_n = DATA_TUPLE_MAGIC_N; + + { /* In the debug version, initialize fields to an error value */ + ulint i; + + for (i = 0; i < n_t_fields; i++) { + dfield_t* field; + + if (i >= n_fields) { + field = dtuple_get_nth_v_field( + tuple, i - n_fields); + } else { + field = dtuple_get_nth_field(tuple, i); + } + + dfield_set_len(field, UNIV_SQL_NULL); + field->data = 
&data_error; + dfield_get_type(field)->mtype = DATA_ERROR; + dfield_get_type(field)->prtype = DATA_ERROR; + } + } +#endif + MEM_CHECK_ADDRESSABLE(tuple->fields, n_t_fields + * sizeof *tuple->fields); + MEM_UNDEFINED(tuple->fields, n_t_fields * sizeof *tuple->fields); + return(tuple); +} + +/** Duplicate the virtual field data in a dtuple_t +@param[in,out] vrow dtuple contains the virtual fields +@param[in,out] heap heap memory to use */ +UNIV_INLINE +void +dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap) +{ + for (ulint i = 0; i < vrow->n_v_fields; i++) { + dfield_t* dfield = dtuple_get_nth_v_field(vrow, i); + dfield_dup(dfield, heap); + } +} + +/** Initialize the virtual field data in a dtuple_t +@param[in,out] vrow dtuple contains the virtual fields */ +UNIV_INLINE +void +dtuple_init_v_fld(dtuple_t* vrow) +{ + for (ulint i = 0; i < vrow->n_v_fields; i++) { + dfield_t* dfield = dtuple_get_nth_v_field(vrow, i); + dfield_get_type(dfield)->mtype = DATA_MISSING; + dfield_set_len(dfield, UNIV_SQL_NULL); + } +} + +/**********************************************************//** +Creates a data tuple to a memory heap. The default value for number +of fields used in record comparisons for this tuple is n_fields. +@return own: created tuple */ +UNIV_INLINE +dtuple_t* +dtuple_create( +/*==========*/ + mem_heap_t* heap, /*!< in: memory heap where the tuple + is created, DTUPLE_EST_ALLOC(n_fields) + bytes will be allocated from this heap */ + ulint n_fields) /*!< in: number of fields */ +{ + return(dtuple_create_with_vcol(heap, n_fields, 0)); +} + +/** Creates a data tuple with virtual columns to a memory heap. 
+@param[in] heap memory heap where the tuple is created +@param[in] n_fields number of fields +@param[in] n_v_fields number of fields on virtual col +@return own: created tuple */ +UNIV_INLINE +dtuple_t* +dtuple_create_with_vcol( + mem_heap_t* heap, + ulint n_fields, + ulint n_v_fields) +{ + void* buf; + ulint buf_size; + dtuple_t* tuple; + + ut_ad(heap); + + buf_size = DTUPLE_EST_ALLOC(n_fields + n_v_fields); + buf = mem_heap_alloc(heap, buf_size); + + tuple = dtuple_create_from_mem(buf, buf_size, n_fields, n_v_fields); + + return(tuple); +} + +/** Copies a data tuple's virtual fields to another. This is a shallow copy; +@param[in,out] d_tuple destination tuple +@param[in] s_tuple source tuple */ +UNIV_INLINE +void +dtuple_copy_v_fields( + dtuple_t* d_tuple, + const dtuple_t* s_tuple) +{ + + ulint n_v_fields = dtuple_get_n_v_fields(d_tuple); + ut_ad(n_v_fields == dtuple_get_n_v_fields(s_tuple)); + + for (ulint i = 0; i < n_v_fields; i++) { + dfield_copy(dtuple_get_nth_v_field(d_tuple, i), + dtuple_get_nth_v_field(s_tuple, i)); + } +} + +/*********************************************************************//** +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. 
+@return own: copy of tuple */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + const dtuple_t* tuple, /*!< in: tuple to copy from */ + mem_heap_t* heap) /*!< in: memory heap + where the tuple is created */ +{ + ulint n_fields = dtuple_get_n_fields(tuple); + ulint n_v_fields = dtuple_get_n_v_fields(tuple); + dtuple_t* new_tuple = dtuple_create_with_vcol( + heap, n_fields, n_v_fields); + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_copy(dtuple_get_nth_field(new_tuple, i), + dtuple_get_nth_field(tuple, i)); + } + + for (i = 0; i < n_v_fields; i++) { + dfield_copy(dtuple_get_nth_v_field(new_tuple, i), + dtuple_get_nth_v_field(tuple, i)); + } + + return(new_tuple); +} + +/**********************************************************//** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. Neither +is possible space in externally stored parts of the field. +@return sum of data lengths */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + const dtuple_t* tuple, /*!< in: typed data tuple */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + ulint sum = 0; + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = tuple->n_fields; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (len == UNIV_SQL_NULL) { + len = dtype_get_sql_null_size(dfield_get_type(field), + comp); + } + + sum += len; + } + + return(sum); +} + +/*********************************************************************//** +Computes the number of externally stored fields in a data tuple. 
+@return number of externally stored fields */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_ext = 0; + ulint n_fields = tuple->n_fields; + ulint i; + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + for (i = 0; i < n_fields; i++) { + n_ext += dtuple_get_nth_field(tuple, i)->ext; + } + + return(n_ext); +} + +/*******************************************************************//** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /*!< in: data tuple */ + ulint n) /*!< in: number of fields to set */ +{ + dtype_t* dfield_type; + ulint i; + + for (i = 0; i < n; i++) { + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dtype_set(dfield_type, DATA_BINARY, 0, 0); + } +} + +/** Fold a prefix given as the number of fields of a tuple. +@param[in] tuple index record +@param[in] n_fields number of complete fields to fold +@param[in] n_bytes number of bytes to fold in the last field +@param[in] index_id index tree ID +@return the folded value */ +UNIV_INLINE +ulint +dtuple_fold( + const dtuple_t* tuple, + ulint n_fields, + ulint n_bytes, + index_id_t tree_id) +{ + const dfield_t* field; + ulint i; + const byte* data; + ulint len; + ulint fold; + + ut_ad(tuple); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(tuple)); + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = 
ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/**********************************************************************//** +Writes an SQL null field full of zeros. */ +UNIV_INLINE +void +data_write_sql_null( +/*================*/ + byte* data, /*!< in: pointer to a buffer of size len */ + ulint len) /*!< in: SQL null size in bytes */ +{ + memset(data, 0, len); +} + +/**********************************************************************//** +Checks if a dtuple contains an SQL null value. +@return TRUE if some field is SQL null */ +UNIV_INLINE +ibool +dtuple_contains_null( +/*=================*/ + const dtuple_t* tuple) /*!< in: dtuple */ +{ + ulint n; + ulint i; + + n = dtuple_get_n_fields(tuple); + + for (i = 0; i < n; i++) { + if (dfield_is_null(dtuple_get_nth_field(tuple, i))) { + + return(TRUE); + } + } + + return(FALSE); +} + +/**************************************************************//** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ +{ + mem_heap_free(vector->heap); +} diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h new file mode 100644 index 00000000..750c3534 --- /dev/null +++ b/storage/innobase/include/data0type.h @@ -0,0 +1,606 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/data0type.h +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#ifndef data0type_h +#define data0type_h + +#include "univ.i" + +/** Special length indicating a missing instantly added column */ +#define UNIV_SQL_DEFAULT (UNIV_SQL_NULL - 1) + +/** @return whether a length is actually stored in a field */ +#define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT) + +extern ulint data_mysql_default_charset_coll; +#define DATA_MYSQL_BINARY_CHARSET_COLL 63 + +/* SQL data type struct */ +struct dtype_t; + +/** SQL Like operator comparison types */ +enum ib_like_t { + IB_LIKE_EXACT, /**< e.g. 
STRING */ + IB_LIKE_PREFIX /**< e.g., STRING% */ +}; + +/*-------------------------------------------*/ +/* The 'MAIN TYPE' of a column */ +#define DATA_MISSING 0 /* missing column */ +#define DATA_VARCHAR 1 /* character varying of the + latin1_swedish_ci charset-collation; note + that the MySQL format for this, DATA_BINARY, + DATA_VARMYSQL, is also affected by whether the + 'precise type' contains + DATA_MYSQL_TRUE_VARCHAR */ +#define DATA_CHAR 2 /* fixed length character of the + latin1_swedish_ci charset-collation */ +#define DATA_FIXBINARY 3 /* binary string of fixed length */ +#define DATA_BINARY 4 /* binary string */ +#define DATA_BLOB 5 /* binary large object, or a TEXT type; + if prtype & DATA_BINARY_TYPE == 0, then this is + actually a TEXT column (or a BLOB created + with < 4.0.14; since column prefix indexes + came only in 4.0.14, the missing flag in BLOBs + created before that does not cause any harm) */ +#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */ +#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */ +#define DATA_SYS 8 /* system column */ + +/* Data types >= DATA_FLOAT must be compared using the whole field, not as +binary strings */ + +#define DATA_FLOAT 9 +#define DATA_DOUBLE 10 +#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */ +#define DATA_VARMYSQL 12 /* any charset varying length char */ +#define DATA_MYSQL 13 /* any charset fixed length char */ + /* NOTE that 4.1.1 used DATA_MYSQL and + DATA_VARMYSQL for all character sets, and the + charset-collation for tables created with it + can also be latin1_swedish_ci */ + +/* DATA_GEOMETRY includes all standard geometry datatypes as described in +OGC standard(point, line_string, polygon, multi_point, multi_polygon, +multi_line_string, geometry_collection, geometry). +Currently, geometry data is stored in the standard Well-Known Binary(WKB) +format (http://www.opengeospatial.org/standards/sfa). +We use BLOB as the underlying datatype. 
*/ +#define DATA_GEOMETRY 14 /* geometry datatype of variable length */ +#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size() + requires the values are <= 63 */ + +#define DATA_MTYPE_CURRENT_MIN DATA_VARCHAR /* minimum value of mtype */ +#define DATA_MTYPE_CURRENT_MAX DATA_GEOMETRY /* maximum value of mtype */ +/*-------------------------------------------*/ +/* The 'PRECISE TYPE' of a column */ +/* +Tables created by a MySQL user have the following convention: + +- In the least significant byte in the precise type we store the MySQL type +code (not applicable for system columns). + +- In the second least significant byte we OR flags DATA_NOT_NULL, +DATA_UNSIGNED, DATA_BINARY_TYPE. + +- In the third least significant byte of the precise type of string types we +store the MySQL charset-collation code. In DATA_BLOB columns created with +< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there +are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no +problem, though. + +Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the +precise type, since the charset was always the default charset of the MySQL +installation. If the stored charset code is 0 in the system table SYS_COLUMNS +of InnoDB, that means that the default charset of this MySQL installation +should be used. + +When loading a table definition from the system tables to the InnoDB data +dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check +if the stored charset-collation is 0, and if that is the case and the type is +a non-binary string, replace that 0 by the default charset-collation code of +this MySQL installation. In short, in old tables, the charset-collation code +in the system tables on disk can be 0, but in in-memory data structures +(dtype_t), the charset-collation code is always != 0 for non-binary string +types. 
+ +In new tables, in binary string types, the charset-collation code is the +MySQL code for the 'binary charset', that is, != 0. + +For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those +DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci, +InnoDB performs all comparisons internally, without resorting to the MySQL +comparison functions. This is to save CPU time. + +InnoDB's own internal system tables have different precise types for their +columns, and for them the precise type is usually not used at all. +*/ + +#define DATA_ENGLISH 4 /* English language character string: this + is a relic from pre-MySQL time and only used + for InnoDB's own system tables */ +#define DATA_ERROR 111 /* another relic from pre-MySQL time */ + +#define DATA_MYSQL_TYPE_MASK 255U/* AND with this mask to extract the MySQL + type from the precise type */ +#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3 + format true VARCHAR */ + +/* Precise data types for system columns and the length of those columns; +NOTE: the values must run from 0 up in the order given! 
All codes must +be less than 256 */ +#define DATA_ROW_ID 0 /* row id: a 48-bit integer */ +#define DATA_ROW_ID_LEN 6 /* stored length for row id */ + +#define DATA_TRX_ID 1 /* transaction id: 6 bytes */ +#define DATA_TRX_ID_LEN 6 + +#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */ +#define DATA_ROLL_PTR_LEN 7 + +#define DATA_N_SYS_COLS 3 /* number of system columns defined above */ + +#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */ + +#define DATA_SYS_PRTYPE_MASK 0xFU /* mask to extract the above from prtype */ + +/* Flags ORed to the precise data type */ +#define DATA_NOT_NULL 256U /* this is ORed to the precise type when + the column is declared as NOT NULL */ +#define DATA_UNSIGNED 512U /* this id ORed to the precise type when + we have an unsigned integer type */ +#define DATA_BINARY_TYPE 1024U /* if the data type is a binary character + string, this is ORed to the precise type: + this only holds for tables created with + >= MySQL-4.0.14 */ +/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1. + In earlier versions this was set for some + BLOB columns. +*/ +#define DATA_GIS_MBR 2048U /* Used as GIS MBR column */ +/** the size of a GIS maximum bounding rectangle */ +constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double)); + +#define DATA_LONG_TRUE_VARCHAR 4096U /* this is ORed to the precise data + type when the column is true VARCHAR where + MySQL uses 2 bytes to store the data len; + for shorter VARCHARs MySQL uses only 1 byte */ +#define DATA_VIRTUAL 8192U /* Virtual column */ + +/** System Versioning */ +#define DATA_VERS_START 16384U /* start system field */ +#define DATA_VERS_END 32768U /* end system field */ +/** system-versioned user data column */ +#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END) + +/** Check whether locking is disabled (never). 
*/ +#define dict_table_is_locking_disabled(table) false + +/*-------------------------------------------*/ + +/* This many bytes we need to store the type information affecting the +alphabetical order for a single field and decide the storage size of an +SQL null*/ +#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +/* In the >= 4.1.x storage format we add 2 bytes more so that we can also +store the charset-collation number; one byte is left unused, though */ +#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 + +/* Maximum multi-byte character length in bytes, plus 1 */ +#define DATA_MBMAX 8 + +/* For checking if mtype is GEOMETRY datatype */ +#define DATA_GEOMETRY_MTYPE(mtype) ((mtype) == DATA_GEOMETRY) + +/* For checking if mtype is BLOB or GEOMETRY, since we use BLOB as +the underlying datatype of GEOMETRY data. */ +#define DATA_LARGE_MTYPE(mtype) ((mtype) == DATA_BLOB \ + || (mtype) == DATA_GEOMETRY) + +/* For checking if data type is big length data type. */ +#define DATA_BIG_LEN_MTYPE(len, mtype) ((len) > 255 || DATA_LARGE_MTYPE(mtype)) + +/* For checking if the column is a big length column. */ +#define DATA_BIG_COL(col) DATA_BIG_LEN_MTYPE((col)->len, (col)->mtype) + +/* For checking if data type is large binary data type. */ +#define DATA_LARGE_BINARY(mtype,prtype) ((mtype) == DATA_GEOMETRY || \ + ((mtype) == DATA_BLOB && !((prtype) & DATA_BINARY_TYPE))) + +/* We now support 15 bits (up to 32767) collation number */ +#define MAX_CHAR_COLL_NUM 32767 + +/* Mask to get the Charset Collation number (0x7fff) */ +#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! 
*/ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type); /*!< in: type struct */ +/*********************************************************************//** +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. +@return length of the prefix, in bytes */ +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of + a multi-byte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a multi-byte character, in bytes */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str); /*!< in: the string whose prefix + length is being determined */ +/** @return whether main type is a string type */ +inline bool dtype_is_string_type(ulint mtype) +{ + return mtype <= DATA_BLOB + || mtype == DATA_MYSQL || mtype == DATA_VARMYSQL; +} + +/** @return whether a type is a binary string type */ +inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype) +{ + /* Note that for tables created before MySQL 4.0.14, + we do not know if a DATA_BLOB column is a BLOB or a TEXT column. + For those DATA_BLOB columns we return false. */ + + return mtype == DATA_FIXBINARY || mtype == DATA_BINARY + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE)); +} + +/** @return whether a type is a non-binary string type */ +inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype) +{ + return dtype_is_string_type(mtype) + && !dtype_is_binary_string_type(mtype, prtype); +} + +/*********************************************************************//** +Sets a data type structure. 
*/ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len); /*!< in: precision of type */ +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2); /*!< in: type struct to copy from */ +/*********************************************************************//** +Gets the SQL main data type. +@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type); /*!< in: data type */ +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type); /*!< in: data type */ + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + unsigned* mbminlen, /*!< out: minimum length of a + multi-byte character */ + unsigned* mbmaxlen); /*!< out: maximum length of a + multi-byte character */ +/** +Get the charset-collation code for string types. +@param prtype InnoDB precise type +@return charset-collation code */ +inline uint16_t dtype_get_charset_coll(ulint prtype) +{ + return static_cast<uint16_t>(prtype >> 16) & CHAR_COLL_MASK; +} + +/** Form a precise type from the < 4.1.2 format precise type plus the +charset-collation code. +@param[in] old_prtype MySQL type code and the flags + DATA_BINARY_TYPE etc. 
+@param[in] charset_coll character-set collation code +@return precise type, including the charset-collation code */ +UNIV_INLINE +uint32_t +dtype_form_prtype(ulint old_prtype, ulint charset_coll) +{ + ut_ad(old_prtype < 256 * 256); + ut_ad(charset_coll <= MAX_CHAR_COLL_NUM); + return(uint32_t(old_prtype + (charset_coll << 16))); +} + +/*********************************************************************//** +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. +@return whether a subset of UTF-8 */ +UNIV_INLINE +bool +dtype_is_utf8( +/*==========*/ + ulint prtype);/*!< in: precise data type */ +/*********************************************************************//** +Gets the type length. +@return fixed length of the type, in bytes, or 0 if variable-length */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type); /*!< in: data type */ + +/*********************************************************************//** +Gets the minimum length of a character, in bytes. +@return minimum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + const dtype_t* type); /*!< in: type */ +/*********************************************************************//** +Gets the maximum length of a character, in bytes. +@return maximum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + const dtype_t* type); /*!< in: type */ +/***********************************************************************//** +Returns the size of a fixed size data type, 0 if not a fixed size type. 
+@return fixed size, or 0 */ +UNIV_INLINE +unsigned +dtype_get_fixed_size_low( +/*=====================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminlen, /*!< in: minimum length of a + multibyte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of a + multibyte character, in bytes */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ + +/***********************************************************************//** +Returns the minimum size of a data type. +@return minimum size */ +UNIV_INLINE +unsigned +dtype_get_min_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminlen, /*!< in: minimum length of a character */ + ulint mbmaxlen); /*!< in: maximum length of a character */ +/***********************************************************************//** +Returns the maximum size of a data type. Note: types in system tables may be +incomplete and return incorrect information. +@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len); /*!< in: length */ +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. 
*/ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for the stored order info */ +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len);/*!< in: prefix length to + replace type->len, or 0 */ +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for stored type order info */ + +/*********************************************************************//** +Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len +@return the SQL type name */ +UNIV_INLINE +char* +dtype_sql_name( +/*===========*/ + unsigned mtype, /*!< in: mtype */ + unsigned prtype, /*!< in: prtype */ + unsigned len, /*!< in: len */ + char* name, /*!< out: SQL name */ + unsigned name_sz);/*!< in: size of the name buffer */ + +/*********************************************************************//** +Validates a data type structure. +@return TRUE if ok */ +ibool +dtype_validate( +/*===========*/ + const dtype_t* type); /*!< in: type struct to validate */ +#ifdef UNIV_DEBUG +/** Print a data type structure. 
+@param[in] type data type */ +void +dtype_print( + const dtype_t* type); +#endif /* UNIV_DEBUG */ + +/* Structure for an SQL data type. +If you add fields to this structure, be sure to initialize them everywhere. +This structure is initialized in the following functions: +dtype_set() +dtype_read_for_order_and_null_size() +dtype_new_read_for_order_and_null_size() +sym_tab_add_null_lit() */ + +struct dtype_t{ + unsigned prtype:32; /*!< precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + unsigned mtype:8; /*!< main data type */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /*!< length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + unsigned mbminlen:3; /*!< minimum length of a character, + in bytes */ + unsigned mbmaxlen:3; /*!< maximum length of a character, + in bytes */ + + /** @return whether this is system versioned user field */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system field start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system field end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } + + /** Set the type of the BLOB in the hidden metadata record. 
*/ + void metadata_blob_init() + { + prtype = DATA_NOT_NULL; + mtype = DATA_BLOB; + len = 0; + mbminlen = 0; + mbmaxlen = 0; + } +}; + +/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ +extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + +/** Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/** The delete-mark flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL + +/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ +enum rec_comp_status_t { + /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_ORDINARY = 0, + /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_NODE_PTR = 1, + /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ + REC_STATUS_INFIMUM = 2, + /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ + REC_STATUS_SUPREMUM = 3, + /** Clustered index record that has been inserted or updated + after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ + REC_STATUS_INSTANT = 4 +}; + +/** The dtuple_t::info_bits of the hidden metadata of instant ADD COLUMN. +@see rec_is_metadata() +@see rec_is_alter_metadata() */ +static const byte REC_INFO_METADATA_ADD + = REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT; + +/** The dtuple_t::info_bits of the hidden metadata of instant ALTER TABLE. 
+@see rec_is_metadata() */ +static const byte REC_INFO_METADATA_ALTER + = REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG; + +#include "data0type.ic" + +#endif diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic new file mode 100644 index 00000000..b81b68e6 --- /dev/null +++ b/storage/innobase/include/data0type.ic @@ -0,0 +1,618 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/data0type.ic +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "ha_prototypes.h" + +/*********************************************************************//** +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. 
+@return whether a subset of UTF-8 */ +UNIV_INLINE +bool +dtype_is_utf8( +/*==========*/ + ulint prtype) /*!< in: precise data type */ +{ + /* These codes have been copied from strings/ctype-extra.c + and strings/ctype-utf8.c. */ + switch (dtype_get_charset_coll(prtype)) { + case 11: /* ascii_general_ci */ + case 65: /* ascii_bin */ + case 33: /* utf8_general_ci */ + case 83: /* utf8_bin */ + case 254: /* utf8_general_cs */ + return true; + } + + return false; +} + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type) /*!< in: type struct */ +{ + return(type->prtype & 0xFFUL); +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + unsigned*mbminlen, /*!< out: minimum length of a + multi-byte character */ + unsigned*mbmaxlen) /*!< out: maximum length of a + multi-byte character */ +{ + if (dtype_is_string_type(mtype)) { + innobase_get_cset_width(dtype_get_charset_coll(prtype), + mbminlen, mbmaxlen); + ut_ad(*mbminlen <= *mbmaxlen); + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + *mbminlen = *mbmaxlen = 0; + } +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. 
*/ +UNIV_INLINE +void +dtype_set_mblen( +/*============*/ + dtype_t* type) /*!< in/out: type */ +{ + unsigned mbminlen, mbmaxlen; + + dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen); + type->mbminlen = mbminlen & 7; + type->mbmaxlen = mbmaxlen & 7; + + ut_ad(dtype_validate(type)); +} + +/*********************************************************************//** +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision of type */ +{ + ut_ad(type); + ut_ad(mtype <= DATA_MTYPE_MAX); + + type->mtype = static_cast<byte>(mtype); + type->prtype = static_cast<unsigned>(prtype); + type->len = static_cast<uint16_t>(len); + + dtype_set_mblen(type); +} + +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2) /*!< in: type struct to copy from */ +{ + *type1 = *type2; + + ut_ad(dtype_validate(type1)); +} + +/*********************************************************************//** +Gets the SQL main data type. +@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->mtype); +} + +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->prtype); +} + +/*********************************************************************//** +Gets the type length. 
+@return fixed length of the type, in bytes, or 0 if variable-length */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->len); +} + +/*********************************************************************//** +Gets the minimum length of a character, in bytes. +@return minimum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + const dtype_t* type) /*!< in: type */ +{ + return type->mbminlen; +} +/*********************************************************************//** +Gets the maximum length of a character, in bytes. +@return maximum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + const dtype_t* type) /*!< in: type */ +{ + return type->mbmaxlen; +} + +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len)/*!< in: prefix length to + replace type->len, or 0 */ +{ + compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + ulint len; + + ut_ad(type); + ut_ad(type->mtype >= DATA_VARCHAR); + ut_ad(type->mtype <= DATA_MTYPE_MAX); + + buf[0] = (byte)(type->mtype & 0xFFUL); + + if (type->prtype & DATA_BINARY_TYPE) { + buf[0] |= 128; + } + + /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) { + buf[0] |= 64; + } + */ + + buf[1] = (byte)(type->prtype & 0xFFUL); + + len = prefix_len ? 
prefix_len : type->len; + + mach_write_to_2(buf + 2, len & 0xFFFFUL); + + ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM); + mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); + + if (type->prtype & DATA_NOT_NULL) { + buf[4] |= 128; + } +} + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the < 4.1.x +storage format. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf) /*!< in: buffer for stored type order info */ +{ + compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE); + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype |= DATA_BINARY_TYPE; + } + + type->len = mach_read_from_2(buf + 2); + + type->prtype = dtype_form_prtype(type->prtype, + data_mysql_default_charset_coll); + dtype_set_mblen(type); +} + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the >= 4.1.x +storage format. 
*/
UNIV_INLINE
void
dtype_new_read_for_order_and_null_size(
/*===================================*/
	dtype_t*	type,	/*!< in: type struct */
	const byte*	buf)	/*!< in: buffer for stored type order info */
{
	compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);

	/* buf[0]: main type in the low 6 bits, bit 128 = binary type;
	buf[1]: the low byte of the precise type */
	type->mtype = buf[0] & 63;
	type->prtype = buf[1];

	if (buf[0] & 128) {
		type->prtype |= DATA_BINARY_TYPE;
	}

	/* bit 128 of buf[4] is the NOT NULL flag */
	if (buf[4] & 128) {
		type->prtype |= DATA_NOT_NULL;
	}

	type->len = mach_read_from_2(buf + 2);

	const ulint	stored_coll = mach_read_from_2(buf + 4)
		& CHAR_COLL_MASK;

	if (dtype_is_string_type(type->mtype)) {
		ut_a(stored_coll <= MAX_CHAR_COLL_NUM);

		/* A stored code of 0 means that this insert buffer record
		was inserted with MySQL version < 4.1.2, and the
		charset-collation code was not explicitly stored to
		dtype->prtype at that time. It must be the default
		charset-collation of this MySQL installation. */
		const ulint	coll = stored_coll
			? stored_coll
			: data_mysql_default_charset_coll;

		type->prtype = dtype_form_prtype(type->prtype, coll);
	}

	dtype_set_mblen(type);
}

/*********************************************************************//**
Returns the type's SQL name (e.g.
BIGINT UNSIGNED) from mtype,prtype,len +@return the SQL type name */ +UNIV_INLINE +char* +dtype_sql_name( +/*===========*/ + unsigned mtype, /*!< in: mtype */ + unsigned prtype, /*!< in: prtype */ + unsigned len, /*!< in: len */ + char* name, /*!< out: SQL name */ + unsigned name_sz)/*!< in: size of the name buffer */ +{ + +#define APPEND_UNSIGNED() \ + do { \ + if (prtype & DATA_UNSIGNED) { \ + snprintf(name + strlen(name), \ + name_sz - strlen(name), \ + " UNSIGNED"); \ + } \ + } while (0) + + snprintf(name, name_sz, "UNKNOWN"); + + switch (mtype) { + case DATA_INT: + switch (len) { + case 1: + snprintf(name, name_sz, "TINYINT"); + break; + case 2: + snprintf(name, name_sz, "SMALLINT"); + break; + case 3: + snprintf(name, name_sz, "MEDIUMINT"); + break; + case 4: + snprintf(name, name_sz, "INT"); + break; + case 8: + snprintf(name, name_sz, "BIGINT"); + break; + } + APPEND_UNSIGNED(); + break; + case DATA_FLOAT: + snprintf(name, name_sz, "FLOAT"); + APPEND_UNSIGNED(); + break; + case DATA_DOUBLE: + snprintf(name, name_sz, "DOUBLE"); + APPEND_UNSIGNED(); + break; + case DATA_FIXBINARY: + snprintf(name, name_sz, "BINARY(%u)", len); + break; + case DATA_CHAR: + case DATA_MYSQL: + snprintf(name, name_sz, "CHAR(%u)", len); + break; + case DATA_VARCHAR: + case DATA_VARMYSQL: + snprintf(name, name_sz, "VARCHAR(%u)", len); + break; + case DATA_BINARY: + snprintf(name, name_sz, "VARBINARY(%u)", len); + break; + case DATA_GEOMETRY: + snprintf(name, name_sz, "GEOMETRY"); + break; + case DATA_BLOB: + switch (len) { + case 9: + snprintf(name, name_sz, "TINYBLOB"); + break; + case 10: + snprintf(name, name_sz, "BLOB"); + break; + case 11: + snprintf(name, name_sz, "MEDIUMBLOB"); + break; + case 12: + snprintf(name, name_sz, "LONGBLOB"); + break; + } + } + + if (prtype & DATA_NOT_NULL) { + snprintf(name + strlen(name), + name_sz - strlen(name), + " NOT NULL"); + } + + return(name); +} + +/***********************************************************************//** +Returns the 
size of a fixed size data type, 0 if not a fixed size type. +@return fixed size, or 0 */ +UNIV_INLINE +unsigned +dtype_get_fixed_size_low( +/*=====================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminlen, /*!< in: minimum length of a + multibyte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of a + multibyte character, in bytes */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + /* fall through */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return static_cast<unsigned>(len); + case DATA_MYSQL: + if (prtype & DATA_BINARY_TYPE) { + return static_cast<unsigned>(len); + } else if (!comp) { + return static_cast<unsigned>(len); + } else { +#ifdef UNIV_DEBUG + unsigned i_mbminlen, i_mbmaxlen; + + innobase_get_cset_width( + dtype_get_charset_coll(prtype), + &i_mbminlen, &i_mbmaxlen); + + ut_ad(i_mbminlen == mbminlen); + ut_ad(i_mbmaxlen == mbmaxlen); +#endif /* UNIV_DEBUG */ + if (mbminlen == mbmaxlen) { + return static_cast<unsigned>(len); + } + } + /* Treat as variable-length. */ + /* fall through */ + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_GEOMETRY: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +/***********************************************************************//** +Returns the minimum size of a data type. 
+@return minimum size */ +UNIV_INLINE +unsigned +dtype_get_min_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminlen, /*!< in: minimum length of a character */ + ulint mbmaxlen) /*!< in: maximum length of a character */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + /* fall through */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return static_cast<unsigned>(len); + case DATA_MYSQL: + if (prtype & DATA_BINARY_TYPE) { + return static_cast<unsigned>(len); + } else { + if (mbminlen == mbmaxlen) { + return static_cast<unsigned>(len); + } + + /* this is a variable-length character set */ + ut_a(mbminlen > 0); + ut_a(mbmaxlen > mbminlen); + ut_a(len % mbmaxlen == 0); + return static_cast<unsigned>( + len * mbminlen / mbmaxlen); + } + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_GEOMETRY: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +/***********************************************************************//** +Returns the maximum size of a data type. Note: types in system tables may be +incomplete and return incorrect information. 
+@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len) /*!< in: length */ +{ + switch (mtype) { + case DATA_SYS: + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_MYSQL: + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + return(len); + case DATA_GEOMETRY: + case DATA_BLOB: + break; + default: + ut_error; + } + + return(ULINT_MAX); +} + +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + type->mbminlen, type->mbmaxlen, comp)); +} diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h new file mode 100644 index 00000000..bcd6b8bc --- /dev/null +++ b/storage/innobase/include/data0types.h @@ -0,0 +1,36 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0types.h +Some type definitions + +Created 9/21/2000 Heikki Tuuri +*************************************************************************/ + +#ifndef data0types_h +#define data0types_h + +/* SQL data field struct */ +struct dfield_t; + +/* SQL data tuple struct */ +struct dtuple_t; + +#endif + diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h new file mode 100644 index 00000000..6cfc63f4 --- /dev/null +++ b/storage/innobase/include/db0err.h @@ -0,0 +1,178 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2018, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/db0err.h +Global error codes for the database + +Created 5/24/1996 Heikki Tuuri +*******************************************************/ + +#ifndef db0err_h +#define db0err_h + +/* Do not include univ.i because univ.i includes this. */ + +/** Global status and error codes for the database. DB_SUCCESS is the +first enumerator and therefore has the value 0. */ +enum dberr_t { + DB_SUCCESS, + + DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new + explicit record lock was created */ + + /* The following are error codes */ + DB_ERROR = 11, + DB_INTERRUPTED, + DB_OUT_OF_MEMORY, + DB_OUT_OF_FILE_SPACE, + DB_LOCK_WAIT, + DB_DEADLOCK, + DB_ROLLBACK, + DB_DUPLICATE_KEY, + DB_MISSING_HISTORY, /*!< required history data has been + deleted due to lack of space in + rollback segment */ + DB_CLUSTER_NOT_FOUND = 30, + DB_TABLE_NOT_FOUND, + DB_MUST_GET_MORE_FILE_SPACE, /*!< the database has to be stopped + and restarted with more file space */ + DB_TABLE_IS_BEING_USED, + DB_TOO_BIG_RECORD, /*!< a record in an index would not fit + on a compressed page, or it would + become bigger than 1/2 free space in + an uncompressed page frame */ + DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */ + DB_NO_REFERENCED_ROW, /*!< referenced key value not found + for a foreign key in an insert or + update of a row */ + DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row + because it contains a key value + which is referenced */ + DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint + to a table failed */ + DB_CORRUPTION, /*!< data structure corruption + noticed */ + DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint + from a table failed */ + DB_NO_SAVEPOINT, /*!< no savepoint exists with the given + name */ + 
DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table + tablespace because a file of the same + name already exists */ + DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is + being dropped right now */ + DB_TABLESPACE_NOT_FOUND, /*!< Attempt to delete a tablespace + instance that was not found in the + tablespace hash table */ + DB_LOCK_TABLE_FULL, /*!< lock structs have exhausted the + buffer pool (for big transactions, + InnoDB stores the lock structs in the + buffer pool) */ + DB_FOREIGN_DUPLICATE_KEY, /*!< foreign key constraints + activated by the operation would + lead to a duplicate key in some + table */ + DB_TOO_MANY_CONCURRENT_TRXS, /*!< when InnoDB runs out of the + preconfigured undo slots, this can + only happen when there are too many + concurrent transactions */ + DB_UNSUPPORTED, /*!< when InnoDB sees any artefact or + a feature that it can't recognize or + work with e.g., FT indexes created by + a later version of the engine. */ + + DB_INVALID_NULL, /*!< a NOT NULL column was found to + be NULL during table rebuild */ + + DB_STATS_DO_NOT_EXIST, /*!< an operation that requires the + persistent storage, used for recording + table and index statistics, was + requested but this storage does not + exist itself or the stats for a given + table do not exist */ + DB_FOREIGN_EXCEED_MAX_CASCADE, /*!< Foreign key constraint related + cascading delete/update exceeds + maximum allowed depth */ + DB_CHILD_NO_INDEX, /*!< the child (foreign) table does + not have an index that contains the + foreign keys as its prefix columns */ + DB_PARENT_NO_INDEX, /*!< the parent table does not + have an index that contains the + foreign keys as its prefix columns */ + DB_TOO_BIG_INDEX_COL, /*!< index column size exceeds + maximum limit */ + DB_INDEX_CORRUPT, /*!< we have corrupted index */ + DB_UNDO_RECORD_TOO_BIG, /*!< the undo log record is too big */ + DB_READ_ONLY, /*!< Update operation attempted in + a read-only transaction */ + DB_FTS_INVALID_DOCID, /*!< FTS 
Doc ID cannot be zero */ + DB_TABLE_IN_FK_CHECK, /*!< table is being used in foreign + key check */ + DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big + during online index creation */ + + DB_IDENTIFIER_TOO_LONG, /*!< Identifier name too long */ + DB_FTS_EXCEED_RESULT_CACHE_LIMIT, /*!< FTS query memory + exceeds result cache limit */ + DB_TEMP_FILE_WRITE_FAIL, /*!< Temp file write failure */ + DB_CANT_CREATE_GEOMETRY_OBJECT, /*!< Cannot create specified Geometry + data object */ + DB_CANNOT_OPEN_FILE, /*!< Cannot open a file */ + DB_FTS_TOO_MANY_WORDS_IN_PHRASE, + /*!< Too many words in a phrase */ + + DB_DECRYPTION_FAILED, /*!< Tablespace encrypted and + decrypt operation failed because + of missing key management plugin, + or missing or incorrect key or + incorrect AES method or algorithm. */ + + DB_IO_ERROR = 100, /*!< Generic IO error */ + + DB_IO_PARTIAL_FAILED, /*!< Partial IO request failed */ + + DB_FORCED_ABORT, /*!< Transaction was forced to rollback + by a higher priority transaction */ + + DB_TABLE_CORRUPT, /*!< Table/clustered index is + corrupted */ + + DB_COMPUTE_VALUE_FAILED, /*!< Compute generated value failed */ + + DB_NO_FK_ON_S_BASE_COL, /*!< Cannot add foreign constraint + placed on the base column of + stored column */ + + DB_IO_NO_PUNCH_HOLE, /*!< Punch hole not supported by + file system. */ + + DB_PAGE_CORRUPTED, /*!< Page read from tablespace is + corrupted. 
*/ + /* The following are partial failure codes */ + DB_FAIL = 1000, + DB_OVERFLOW, + DB_UNDERFLOW, + DB_STRONG_FAIL, + DB_ZIP_OVERFLOW, + DB_RECORD_NOT_FOUND = 1500, + DB_END_OF_INDEX, + DB_NOT_FOUND, /*!< Generic error code for "Not found" + type of errors */ +}; + +#endif diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h new file mode 100644 index 00000000..0f96df8f --- /dev/null +++ b/storage/innobase/include/dict0boot.h @@ -0,0 +1,330 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0boot.h +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0boot_h +#define dict0boot_h + +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "buf0buf.h" +#include "dict0dict.h" + +/** @return the DICT_HDR block, x-latched */ +buf_block_t *dict_hdr_get(mtr_t* mtr); +/**********************************************************************//** +Returns a new table, index, or space id. */ +void +dict_hdr_get_new_id( +/*================*/ + table_id_t* table_id, /*!< out: table id + (not assigned if NULL) */ + index_id_t* index_id, /*!< out: index id + (not assigned if NULL) */ + ulint* space_id); /*!< out: space id + (not assigned if NULL) */ +/**********************************************************************//** +Writes the current value of the row id counter to the dictionary header file +page. */ +void +dict_hdr_flush_row_id(void); +/*=======================*/ +/**********************************************************************//** +Returns a new row id. +@return the new id */ +UNIV_INLINE +row_id_t +dict_sys_get_new_row_id(void); +/*=========================*/ +/**********************************************************************//** +Writes a row id to a record or other 6-byte stored form. 
*/ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /*!< in: record field */ + row_id_t row_id);/*!< in: row id */ +/*****************************************************************//** +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. */ +dberr_t +dict_boot(void) +/*===========*/ + MY_ATTRIBUTE((warn_unused_result)); + +/*****************************************************************//** +Creates and initializes the data dictionary at the server bootstrap. +@return DB_SUCCESS or error code. */ +dberr_t +dict_create(void) +/*=============*/ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. */ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ + MY_ATTRIBUTE((warn_unused_result)); + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ids for the basic system tables and their indexes */ +#define DICT_TABLES_ID 1 +#define DICT_COLUMNS_ID 2 +#define DICT_INDEXES_ID dict_index_t::DICT_INDEXES_ID /* 3 */ +#define DICT_FIELDS_ID 4 +/* The following is a secondary index on SYS_TABLES */ +#define DICT_TABLE_IDS_ID 5 + +/* The offset of the dictionary header on the page */ +#define DICT_HDR FSEG_PAGE_DATA + +/*-------------------------------------------------------------*/ +/* Dictionary header offsets */ +#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */ +#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */ +#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */ +#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id,or 0*/ 
+#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete,always DICT_HDR_FIRST_ID*/ +#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */ +#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */ +#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */ +#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */ +#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */ + +#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace + segment into which the dictionary + header is created */ +/*-------------------------------------------------------------*/ + +/* The columns in SYS_TABLES */ +enum dict_col_sys_tables_enum { + DICT_COL__SYS_TABLES__NAME = 0, + DICT_COL__SYS_TABLES__ID = 1, + DICT_COL__SYS_TABLES__N_COLS = 2, + DICT_COL__SYS_TABLES__TYPE = 3, + DICT_COL__SYS_TABLES__MIX_ID = 4, + DICT_COL__SYS_TABLES__MIX_LEN = 5, + DICT_COL__SYS_TABLES__CLUSTER_ID = 6, + DICT_COL__SYS_TABLES__SPACE = 7, + DICT_NUM_COLS__SYS_TABLES = 8 +}; +/* The field numbers in the SYS_TABLES clustered index */ +enum dict_fld_sys_tables_enum { + DICT_FLD__SYS_TABLES__NAME = 0, + DICT_FLD__SYS_TABLES__DB_TRX_ID = 1, + DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_TABLES__ID = 3, + DICT_FLD__SYS_TABLES__N_COLS = 4, + DICT_FLD__SYS_TABLES__TYPE = 5, + DICT_FLD__SYS_TABLES__MIX_ID = 6, + DICT_FLD__SYS_TABLES__MIX_LEN = 7, + DICT_FLD__SYS_TABLES__CLUSTER_ID = 8, + DICT_FLD__SYS_TABLES__SPACE = 9, + DICT_NUM_FIELDS__SYS_TABLES = 10 +}; +/* The field numbers in the SYS_TABLE_IDS index */ +enum dict_fld_sys_table_ids_enum { + DICT_FLD__SYS_TABLE_IDS__ID = 0, + DICT_FLD__SYS_TABLE_IDS__NAME = 1, + DICT_NUM_FIELDS__SYS_TABLE_IDS = 2 +}; +/* The columns in SYS_COLUMNS */ +enum dict_col_sys_columns_enum { + DICT_COL__SYS_COLUMNS__TABLE_ID = 0, + DICT_COL__SYS_COLUMNS__POS = 1, + DICT_COL__SYS_COLUMNS__NAME = 2, + DICT_COL__SYS_COLUMNS__MTYPE = 3, + DICT_COL__SYS_COLUMNS__PRTYPE = 4, + DICT_COL__SYS_COLUMNS__LEN = 5, + DICT_COL__SYS_COLUMNS__PREC 
= 6, + DICT_NUM_COLS__SYS_COLUMNS = 7 +}; +/* The field numbers in the SYS_COLUMNS clustered index */ +enum dict_fld_sys_columns_enum { + DICT_FLD__SYS_COLUMNS__TABLE_ID = 0, + DICT_FLD__SYS_COLUMNS__POS = 1, + DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2, + DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_COLUMNS__NAME = 4, + DICT_FLD__SYS_COLUMNS__MTYPE = 5, + DICT_FLD__SYS_COLUMNS__PRTYPE = 6, + DICT_FLD__SYS_COLUMNS__LEN = 7, + DICT_FLD__SYS_COLUMNS__PREC = 8, + DICT_NUM_FIELDS__SYS_COLUMNS = 9 +}; +/* The columns in SYS_INDEXES */ +enum dict_col_sys_indexes_enum { + DICT_COL__SYS_INDEXES__TABLE_ID = 0, + DICT_COL__SYS_INDEXES__ID = 1, + DICT_COL__SYS_INDEXES__NAME = 2, + DICT_COL__SYS_INDEXES__N_FIELDS = 3, + DICT_COL__SYS_INDEXES__TYPE = 4, + DICT_COL__SYS_INDEXES__SPACE = 5, + DICT_COL__SYS_INDEXES__PAGE_NO = 6, + DICT_COL__SYS_INDEXES__MERGE_THRESHOLD = 7, + DICT_NUM_COLS__SYS_INDEXES = 8 +}; +/* The field numbers in the SYS_INDEXES clustered index */ +enum dict_fld_sys_indexes_enum { + DICT_FLD__SYS_INDEXES__TABLE_ID = 0, + DICT_FLD__SYS_INDEXES__ID = 1, + DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2, + DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3, + DICT_FLD__SYS_INDEXES__NAME = 4, + DICT_FLD__SYS_INDEXES__N_FIELDS = 5, + DICT_FLD__SYS_INDEXES__TYPE = 6, + DICT_FLD__SYS_INDEXES__SPACE = 7, + DICT_FLD__SYS_INDEXES__PAGE_NO = 8, + DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD = 9, + DICT_NUM_FIELDS__SYS_INDEXES = 10 +}; +/* The columns in SYS_FIELDS */ +enum dict_col_sys_fields_enum { + DICT_COL__SYS_FIELDS__INDEX_ID = 0, + DICT_COL__SYS_FIELDS__POS = 1, + DICT_COL__SYS_FIELDS__COL_NAME = 2, + DICT_NUM_COLS__SYS_FIELDS = 3 +}; +/* The field numbers in the SYS_FIELDS clustered index */ +enum dict_fld_sys_fields_enum { + DICT_FLD__SYS_FIELDS__INDEX_ID = 0, + DICT_FLD__SYS_FIELDS__POS = 1, + DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2, + DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FIELDS__COL_NAME = 4, + DICT_NUM_FIELDS__SYS_FIELDS = 5 +}; +/* The columns in SYS_FOREIGN */ +enum 
dict_col_sys_foreign_enum { + DICT_COL__SYS_FOREIGN__ID = 0, + DICT_COL__SYS_FOREIGN__FOR_NAME = 1, + DICT_COL__SYS_FOREIGN__REF_NAME = 2, + DICT_COL__SYS_FOREIGN__N_COLS = 3, + DICT_NUM_COLS__SYS_FOREIGN = 4 +}; +/* The field numbers in the SYS_FOREIGN clustered index */ +enum dict_fld_sys_foreign_enum { + DICT_FLD__SYS_FOREIGN__ID = 0, + DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1, + DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2, + DICT_FLD__SYS_FOREIGN__FOR_NAME = 3, + DICT_FLD__SYS_FOREIGN__REF_NAME = 4, + DICT_FLD__SYS_FOREIGN__N_COLS = 5, + DICT_NUM_FIELDS__SYS_FOREIGN = 6 +}; +/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */ +enum dict_fld_sys_foreign_for_name_enum { + DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0, + DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1, + DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2 +}; +/* The columns in SYS_FOREIGN_COLS */ +enum dict_col_sys_foreign_cols_enum { + DICT_COL__SYS_FOREIGN_COLS__ID = 0, + DICT_COL__SYS_FOREIGN_COLS__POS = 1, + DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2, + DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3, + DICT_NUM_COLS__SYS_FOREIGN_COLS = 4 +}; +/* The field numbers in the SYS_FOREIGN_COLS clustered index */ +enum dict_fld_sys_foreign_cols_enum { + DICT_FLD__SYS_FOREIGN_COLS__ID = 0, + DICT_FLD__SYS_FOREIGN_COLS__POS = 1, + DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2, + DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4, + DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5, + DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6 +}; +/* The columns in SYS_TABLESPACES */ +enum dict_col_sys_tablespaces_enum { + DICT_COL__SYS_TABLESPACES__SPACE = 0, + DICT_COL__SYS_TABLESPACES__NAME = 1, + DICT_COL__SYS_TABLESPACES__FLAGS = 2, + DICT_NUM_COLS__SYS_TABLESPACES = 3 +}; +/* The field numbers in the SYS_TABLESPACES clustered index */ +enum dict_fld_sys_tablespaces_enum { + DICT_FLD__SYS_TABLESPACES__SPACE = 0, + DICT_FLD__SYS_TABLESPACES__DB_TRX_ID = 1, + DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR = 2, + 
DICT_FLD__SYS_TABLESPACES__NAME = 3, + DICT_FLD__SYS_TABLESPACES__FLAGS = 4, + DICT_NUM_FIELDS__SYS_TABLESPACES = 5 +}; +/* The columns in SYS_DATAFILES */ +enum dict_col_sys_datafiles_enum { + DICT_COL__SYS_DATAFILES__SPACE = 0, + DICT_COL__SYS_DATAFILES__PATH = 1, + DICT_NUM_COLS__SYS_DATAFILES = 2 +}; +/* The field numbers in the SYS_DATAFILES clustered index */ +enum dict_fld_sys_datafiles_enum { + DICT_FLD__SYS_DATAFILES__SPACE = 0, + DICT_FLD__SYS_DATAFILES__DB_TRX_ID = 1, + DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_DATAFILES__PATH = 3, + DICT_NUM_FIELDS__SYS_DATAFILES = 4 +}; + +/* The columns in SYS_VIRTUAL */ +enum dict_col_sys_virtual_enum { + DICT_COL__SYS_VIRTUAL__TABLE_ID = 0, + DICT_COL__SYS_VIRTUAL__POS = 1, + DICT_COL__SYS_VIRTUAL__BASE_POS = 2, + DICT_NUM_COLS__SYS_VIRTUAL = 3 +}; +/* The field numbers in the SYS_VIRTUAL clustered index */ +enum dict_fld_sys_virtual_enum { + DICT_FLD__SYS_VIRTUAL__TABLE_ID = 0, + DICT_FLD__SYS_VIRTUAL__POS = 1, + DICT_FLD__SYS_VIRTUAL__BASE_POS = 2, + DICT_FLD__SYS_VIRTUAL__DB_TRX_ID = 3, + DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR = 4, + DICT_NUM_FIELDS__SYS_VIRTUAL = 5 +}; + +/* A number of the columns above occur in multiple tables. These are the +lengths of those fields. */ +#define DICT_FLD_LEN_SPACE 4 +#define DICT_FLD_LEN_FLAGS 4 + +/* When a row id which is zero modulo this number (which must be a power of +two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is +updated */ +#define DICT_HDR_ROW_ID_WRITE_MARGIN 256 + +#include "dict0boot.ic" + +#endif diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic new file mode 100644 index 00000000..d920bdde --- /dev/null +++ b/storage/innobase/include/dict0boot.ic @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0boot.ic +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Returns a new row id. +@return the new id */ +UNIV_INLINE +row_id_t +dict_sys_get_new_row_id(void) +/*=========================*/ +{ + row_id_t id; + + mutex_enter(&dict_sys.mutex); + + id = dict_sys.row_id; + + if (0 == (id % DICT_HDR_ROW_ID_WRITE_MARGIN)) { + + dict_hdr_flush_row_id(); + } + + dict_sys.row_id++; + + mutex_exit(&dict_sys.mutex); + + return(id); +} + +/**********************************************************************//** +Writes a row id to a record or other 6-byte stored form. */ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /*!< in: record field */ + row_id_t row_id) /*!< in: row id */ +{ + compile_time_assert(DATA_ROW_ID_LEN == 6); + mach_write_to_6(field, row_id); +} + +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. 
*/ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ +{ + return(id < DICT_HDR_FIRST_ID); +} + + diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h new file mode 100644 index 00000000..13706d6b --- /dev/null +++ b/storage/innobase/include/dict0crea.h @@ -0,0 +1,324 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.h +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0crea_h +#define dict0crea_h + +#include "dict0dict.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" +#include "fil0crypt.h" + +/*********************************************************************//** +Creates a table create graph. 
+@return own: table create node */ +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as + a memory data structure */ + mem_heap_t* heap, /*!< in: heap where created */ + fil_encryption_t mode, /*!< in: encryption mode */ + uint32_t key_id); /*!< in: encryption key_id */ + +/** Creates an index create graph. +@param[in] index index to create, built as a memory data structure +@param[in] table table name +@param[in,out] heap heap where created +@param[in] add_v new virtual columns added in the same clause with + add index +@return own: index create node */ +ind_node_t* +ind_create_graph_create( + dict_index_t* index, + const char* table, + mem_heap_t* heap, + const dict_add_v_col_t* add_v = NULL); + +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ + +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ + +/***************************************************************//** +Builds an index definition but doesn't update sys_table. +This function is declared void and returns no value. */ +void +dict_build_index_def( +/*=================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx); /*!< in/out: InnoDB transaction + handle */ +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. 
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +dict_create_index_tree( +/*===================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx); /*!< in: InnoDB transaction handle */ + +/** Drop the index tree associated with a row in SYS_INDEXES table. +@param[in,out] pcur persistent cursor on rec +@param[in,out] trx dictionary transaction +@param[in,out] mtr mini-transaction */ +void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); + +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +dict_create_index_tree_in_mem( +/*==========================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx); /*!< in: InnoDB transaction handle */ + +/****************************************************************//** +Creates the foreign key constraints system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. +@return DB_SUCCESS or error code */ +dberr_t +dict_create_or_check_foreign_constraint_tables(void); +/*================================================*/ + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. 
*/ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id + generation; incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign); /*!< in/out: foreign key */ + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Check if a foreign constraint is on columns that serve as base columns +of any stored column. This is to prevent creating SET NULL or CASCADE +constraint on such columns +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@return true if yes, otherwise, false */ +bool +dict_foreigns_has_s_base_col( + const dict_foreign_set& local_fk_set, + const dict_table_t* table); + +/****************************************************************//** +Creates the tablespaces and datafiles system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. +@return DB_SUCCESS or error code */ +dberr_t +dict_create_or_check_sys_tablespace(void); +/*=====================================*/ +/** Creates the virtual column system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. 
+@return DB_SUCCESS or error code */ +dberr_t +dict_create_or_check_sys_virtual(); + +/** Put a tablespace definition into the data dictionary, +replacing what was there previously. +@param[in] space_id Tablespace id +@param[in] name Tablespace name +@param[in] flags Tablespace flags +@param[in] path Tablespace path +@param[in] trx Transaction +@return error code or DB_SUCCESS */ +dberr_t +dict_replace_tablespace_in_dictionary( + ulint space_id, + const char* name, + ulint flags, + const char* path, + trx_t* trx); + +/********************************************************************//** +Add a foreign key definition to the data dictionary tables. +@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + const char* name, /*!< in: table name */ + const dict_foreign_t* foreign,/*!< in: foreign key */ + trx_t* trx) /*!< in/out: dictionary transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/********************************************************************//** +Construct foreign key constraint definition from data dictionary information. +*/ +UNIV_INTERN +char* +dict_foreign_def_get( +/*=================*/ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx); /*!< in: trx */ + +/* Table create node structure */ +struct tab_node_t{ + que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */ + dict_table_t* table; /*!< table to create, built as a + memory data structure with + dict_mem_... 
functions */ + ins_node_t* tab_def; /*!< child node which does the insert of + the table definition; the row to be + inserted is built by the parent node */ + ins_node_t* col_def; /*!< child node which does the inserts + of the column definitions; the row to + be inserted is built by the parent + node */ + ins_node_t* v_col_def; /*!< child node which does the inserts + of the sys_virtual row definitions; + the row to be inserted is built by + the parent node */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + ulint col_no; /*!< next column definition to insert */ + uint key_id; /*!< encryption key_id */ + fil_encryption_t mode; /*!< encryption mode */ + ulint base_col_no; /*!< next base column to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary + storage */ +}; + +/* Table create node states */ +#define TABLE_BUILD_TABLE_DEF 1 +#define TABLE_BUILD_COL_DEF 2 +#define TABLE_BUILD_V_COL_DEF 3 +#define TABLE_ADD_TO_CACHE 4 +#define TABLE_COMPLETED 5 + +/* Index create node struct */ + +struct ind_node_t{ + que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */ + dict_index_t* index; /*!< index to create, built as a + memory data structure with + dict_mem_... 
functions */ + const char* table_name; /*!< table name */ + ins_node_t* ind_def; /*!< child node which does the insert of + the index definition; the row to be + inserted is built by the parent node */ + ins_node_t* field_def; /*!< child node which does the inserts + of the field definitions; the row to + be inserted is built by the parent + node */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + uint32_t page_no; /* root page number of the index */ + dict_table_t* table; /*!< table which owns the index */ + dtuple_t* ind_row; /* index definition row built */ + ulint field_no; /* next field definition to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary + storage */ + const dict_add_v_col_t* + add_v; /*!< new virtual columns that are being + added along with an add index call */ +}; + +/** Compose a column number for a virtual column, stored in the "POS" field +of Sys_columns. The column number includes both its virtual column sequence +(the "nth" virtual column) and its actual column position in original table +@param[in] v_pos virtual column sequence +@param[in] col_pos column position in original table definition +@return composed column position number */ +UNIV_INLINE +ulint +dict_create_v_col_pos( + ulint v_pos, + ulint col_pos); + +/** Get the column number for a virtual column (the column position in +original table), stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return column position in original table */ +UNIV_INLINE +ulint +dict_get_v_col_mysql_pos( + ulint pos); + +/** Get a virtual column sequence (the "nth" virtual column) for a +virtual column, stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return virtual column sequence */ +UNIV_INLINE +ulint +dict_get_v_col_pos( + ulint pos); + +/* Index create node states */ +#define INDEX_BUILD_INDEX_DEF 1 +#define INDEX_BUILD_FIELD_DEF 2 +#define 
INDEX_CREATE_INDEX_TREE 3 +#define INDEX_ADD_TO_CACHE 4 + +#include "dict0crea.ic" + +#endif diff --git a/storage/innobase/include/dict0crea.ic b/storage/innobase/include/dict0crea.ic new file mode 100644 index 00000000..5641206d --- /dev/null +++ b/storage/innobase/include/dict0crea.ic @@ -0,0 +1,136 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.ic +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "ha_prototypes.h" + +#include "mem0mem.h" + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. 
*/ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id generation; + incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign)/*!< in/out: foreign key */ +{ + DBUG_ENTER("dict_create_add_foreign_id"); + + if (foreign->id == NULL) { + /* Generate a new constraint id: the table name, then _ibfk_, then the number; the 20 extra bytes hold the suffix and terminator */ + ulint namelen = strlen(name); + char* id = static_cast<char*>( + mem_heap_alloc(foreign->heap, + namelen + 20)); + + if (dict_table_t::is_temporary_name(name)) { + + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", name, + (ulong) (*id_nr)++); + } else { + char table_name[MAX_TABLE_NAME_LEN + 21]; + uint errors = 0; + + strncpy(table_name, name, (sizeof table_name) - 1); + table_name[(sizeof table_name) - 1] = '\0'; + + /* NOTE(review): both strchr() results are used with + 1 without a NULL check, so this assumes name (and its possibly truncated copy) contains a slash -- confirm with callers */ innobase_convert_to_system_charset( + strchr(table_name, '/') + 1, + strchr(name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + strncpy(table_name, name, + (sizeof table_name) - 1); + table_name[(sizeof table_name) - 1] = '\0'; + } + + /* no overflow if number < 1e13; NOTE(review): id was sized from strlen(name) but is filled from table_name, so this also assumes the converted table_name is not longer than name -- confirm */ + sprintf(id, "%s_ibfk_%lu", table_name, + (ulong) (*id_nr)++); + + if (innobase_check_identifier_length( + strchr(id,'/') + 1)) { + DBUG_RETURN(DB_IDENTIFIER_TOO_LONG); + } + } + foreign->id = id; + + DBUG_PRINT("dict_create_add_foreign_id", + ("generated foreign id: %s", id)); + } + + + DBUG_RETURN(DB_SUCCESS); +} + +/** Compose a column number for a virtual column, stored in the "POS" field +of Sys_columns. 
The column number includes both its virtual column sequence +(the "nth" virtual column) and its actual column position in original table +@param[in] v_pos virtual column sequence +@param[in] col_pos column position in original table definition +@return composed column position number */ +UNIV_INLINE +ulint +dict_create_v_col_pos( + ulint v_pos, + ulint col_pos) +{ + ut_ad(v_pos <= REC_MAX_N_FIELDS); + ut_ad(col_pos <= REC_MAX_N_FIELDS); + + return(((v_pos + 1) << 16) + col_pos); +} + +/** Get the column number for a virtual column (the column position in +original table), stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return column position in original table (the low 16 bits of pos) */ +UNIV_INLINE +ulint +dict_get_v_col_mysql_pos( + ulint pos) +{ + return(pos & 0xFFFF); +} + +/** Get a virtual column sequence (the "nth" virtual column) for a +virtual column, stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return virtual column sequence (pos shifted right by 16 bits, minus 1) */ +UNIV_INLINE +ulint +dict_get_v_col_pos( + ulint pos) +{ + return((pos >> 16) - 1); +} diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h new file mode 100644 index 00000000..3aea41b0 --- /dev/null +++ b/storage/innobase/include/dict0defrag_bg.h @@ -0,0 +1,106 @@ +/***************************************************************************** + +Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0defrag_bg.h +Code used for background table and index +defragmentation + +Created 25/08/2016 Jan Lindström +*******************************************************/ + +#ifndef dict0defrag_bg_h +#define dict0defrag_bg_h + +#include "dict0types.h" + +/** Indices whose defrag stats need to be saved to persistent storage.*/ +struct defrag_pool_item_t { + table_id_t table_id; + index_id_t index_id; +}; + +/** Allocator type, used by std::vector */ +typedef ut_allocator<defrag_pool_item_t> + defrag_pool_allocator_t; + +/** The multitude of tables to be defragmented- an STL vector */ +typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t> + defrag_pool_t; + +/** Pool where we store information on which tables are to be processed +by background defragmentation. */ +extern defrag_pool_t defrag_pool; + +/*****************************************************************//** +Initialize the defrag pool, called once during thread initialization. */ +void +dict_defrag_pool_init(void); +/*========================*/ + +/*****************************************************************//** +Free the resources occupied by the defrag pool, called once during +thread de-initialization. */ +void +dict_defrag_pool_deinit(void); +/*==========================*/ + +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. 
If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index); /*!< in: index to add */ + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index); /*!< in: index to remove */ + +/*****************************************************************//** +Get the first index that has been added for updating persistent defrag +stats and eventually save its stats. */ +void +dict_defrag_process_entries_from_defrag_pool(); +/*===========================================*/ + +/*********************************************************************//** +Save defragmentation result. +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_defrag_summary( +/*============================*/ + dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_defrag_stats( +/*============================*/ + dict_index_t* index); /*!< in: index */ +#endif /* dict0defrag_bg_h */ diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h new file mode 100644 index 00000000..e17da733 --- /dev/null +++ b/storage/innobase/include/dict0dict.h @@ -0,0 +1,1804 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2021, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0dict.h +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0dict_h +#define dict0dict_h + +#include "data0data.h" +#include "dict0mem.h" +#include "fsp0fsp.h" +#include <deque> + +class MDL_ticket; +extern bool innodb_table_stats_not_found; +extern bool innodb_index_stats_not_found; + +/** the first table or index ID for other than hard-coded system tables */ +constexpr uint8_t DICT_HDR_FIRST_ID= 10; + + +/** Get the database name length in a table name. +@param name filename-safe encoded table name "dbname/tablename" +@return database name length */ +inline size_t dict_get_db_name_len(const char *name) +{ + /* table_name_t::dblen() would assert that '/' is contained */ + if (const char* s= strchr(name, '/')) + return size_t(s - name); + + return 0; +} + + +/*********************************************************************//** +Open a table from its database and table name, this is currently used by +foreign constraint parser to get the referenced table. 
+@return complete table name with database and table name, allocated from +heap memory passed in */ +char* +dict_get_referenced_table( +/*======================*/ + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len,/*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap, /*!< in: heap memory */ + CHARSET_INFO* from_cs); /*!< in: table name charset */ +/*********************************************************************//** +Frees a foreign key struct. */ +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign); /*!< in, own: foreign key struct */ +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table); /*!< in: table in the dictionary + memory cache */ +/** Check whether the dict_table_t is a partition. +A partitioned table on the SQL level is composed of InnoDB tables, +where each InnoDB table is a [sub]partition including its secondary indexes +which belongs to the partition. +@param[in] table Table to check. +@return true if the dict_table_t is a partition else false. */ +UNIV_INLINE +bool +dict_table_is_partition(const dict_table_t* table) +{ + /* Check both P and p on all platforms in case it was moved to/from + WIN. 
*/ + return (strstr(table->name.m_name, "#p#") + || strstr(table->name.m_name, "#P#")); +} +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. +@return table name */ +const char* +dict_remove_db_name( +/*================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Operation to perform when opening a table */ +enum dict_table_op_t { + /** Expect the tablespace to exist. */ + DICT_TABLE_OP_NORMAL = 0, + /** Drop any orphan indexes after an aborted online index creation */ + DICT_TABLE_OP_DROP_ORPHAN, + /** Silently load the tablespace if it does not exist, + and do not load the definitions of incomplete indexes. */ + DICT_TABLE_OP_LOAD_TABLESPACE, + /** Open the table only if it's in table cache. */ + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED +}; + +/** Acquire MDL shared for the table name. +@tparam trylock whether to use non-blocking operation +@param[in,out] table table object +@param[in,out] thd background thread +@param[out] mdl mdl ticket +@param[in] table_op operation to perform when opening +@return table object after locking MDL shared +@retval NULL if the table is not readable, or if trylock && MDL blocked */ +template<bool trylock> +dict_table_t* +dict_acquire_mdl_shared(dict_table_t *table, + THD *thd, + MDL_ticket **mdl, + dict_table_op_t table_op= DICT_TABLE_OP_NORMAL); + +/** Look up a table by numeric identifier. 
+@param[in] table_id table identifier +@param[in] dict_locked data dictionary locked +@param[in] table_op operation to perform when opening +@param[in,out] thd background thread, or NULL to not acquire MDL +@param[out] mdl mdl ticket, or NULL +@return table, NULL if does not exist */ +dict_table_t* +dict_table_open_on_id(table_id_t table_id, bool dict_locked, + dict_table_op_t table_op, THD *thd= nullptr, + MDL_ticket **mdl= nullptr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Decrements the count of open handles of a table. +@param[in,out] table table +@param[in] dict_locked data dictionary locked +@param[in] try_drop try to drop any orphan indexes after + an aborted online index creation +@param[in] thd thread to release MDL +@param[in] mdl metadata lock or NULL if the thread is a + foreground one. */ +void +dict_table_close( + dict_table_t* table, + bool dict_locked, + bool try_drop, + THD* thd = NULL, + MDL_ticket* mdl = NULL); + +/*********************************************************************//** +Closes the only open handle to a table and drops a table while assuring +that dict_sys.mutex is held the whole time. This assures that the table +is not evicted after the close when the count of open handles goes to zero. +Because dict_sys.mutex is held, we do not need to call prevent_eviction(). */ +void +dict_table_close_and_drop( +/*======================*/ + trx_t* trx, /*!< in: data dictionary transaction */ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Gets the minimum number of bytes per character. +@return minimum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbminlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the maximum number of bytes per character. 
+@return maximum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbmaxlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /*!< in: column */ + dtype_t* type); /*!< out: data type */ + +/**********************************************************************//** +Determine bytes of column prefix to be stored in the undo log. Please +note that if !dict_table_has_atomic_blobs(table), no prefix +needs to be stored in the undo log. +@return bytes of column prefix to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_field_len_store_undo( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column which index prefix + is based on */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Determine maximum bytes of a virtual column need to be stored +in the undo log. +@param[in] table dict_table_t for the table +@param[in] col_no virtual column number +@return maximum bytes of virtual column to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_v_field_len_store_undo( + dict_table_t* table, + ulint col_no); + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Assert that a column and a data type match. +@return TRUE */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + const dtype_t* type) /*!< in: data type */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Returns the minimum size of the column. 
+@return minimum size */ +UNIV_INLINE +unsigned +dict_col_get_min_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the maximum size of the column. +@return maximum size */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the size of a fixed size column, 0 if not a fixed size column. +@return fixed size, or 0 */ +UNIV_INLINE +unsigned +dict_col_get_fixed_size( +/*====================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +unsigned +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +unsigned +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column position in the clustered index. 
*/ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Gets the column position in the given index. +@param[in] col table column +@param[in] index index to be searched for column +@return position of column in the given index. */ +UNIV_INLINE +ulint +dict_col_get_index_pos( + const dict_col_t* col, + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Unconditionally set the AUTO_INCREMENT counter. +@param[in,out] table table or partition +@param[in] value next available AUTO_INCREMENT value */ +MY_ATTRIBUTE((nonnull)) +UNIV_INLINE +void +dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value) +{ + table->autoinc = value; +} + +/** +@param[in] table table or partition +@return the next AUTO_INCREMENT counter value +@retval 0 if AUTO_INCREMENT is not yet initialized */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +UNIV_INLINE +ib_uint64_t +dict_table_autoinc_read(const dict_table_t* table) +{ + return(table->autoinc); +} + +/** Update the AUTO_INCREMENT sequence if the value supplied is greater +than the current value. 
+@param[in,out] table table or partition +@param[in] value AUTO_INCREMENT value that was assigned to a row +@return whether the AUTO_INCREMENT sequence was updated */ +MY_ATTRIBUTE((nonnull)) +UNIV_INLINE +bool +dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value) +{ + if (value > table->autoinc) { + + table->autoinc = value; + return(true); + } + + return(false); +} + +/**********************************************************************//** +Adds system columns to a table object. */ +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Renames a table object. +@return DB_SUCCESS or error code */ +dberr_t +dict_table_rename_in_cache( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char* new_name, /*!< in: new name */ + bool rename_also_foreigns, + /*!< in: in ALTER TABLE we want + to preserve the original table name + in constraints which reference it */ + bool replace_new_file = false) + /*!< in: whether to replace the + file with the new name + (as part of rolling back TRUNCATE) */ + MY_ATTRIBUTE((nonnull)); + +/** Removes an index from the dictionary cache. +@param[in,out] table table whose index to remove +@param[in,out] index index to remove, this object is destroyed and must not +be accessed by the caller afterwards */ +void +dict_index_remove_from_cache( + dict_table_t* table, + dict_index_t* index); + +/**********************************************************************//** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. 
*/ +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /*!< in/out: table object already in cache */ + table_id_t new_id) /*!< in: new id to set */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in. +At least one of foreign table or referenced table must already be in +the dictionary cache! +@return DB_SUCCESS or error code */ +dberr_t +dict_foreign_add_to_cache( +/*======================*/ + dict_foreign_t* foreign, + /*!< in, own: foreign key constraint */ + const char** col_names, + /*!< in: column names, or NULL to use + foreign->foreign_table->col_names */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); +/*********************************************************************//** +Checks if a table is referenced by foreign keys. +@return TRUE if table is referenced by a foreign key */ +ibool +dict_table_is_referenced_by_foreign_key( +/*====================================*/ + const dict_table_t* table) /*!< in: InnoDB table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table. 
+@return whether all replacements were found */ +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); +/**********************************************************************//** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. +@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the +constraint id does not match */ +dberr_t +dict_foreign_parse_drop_constraints( +/*================================*/ + mem_heap_t* heap, /*!< in: heap from which we can + allocate memory */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table */ + ulint* n, /*!< out: number of constraints + to drop */ + const char*** constraints_to_drop) /*!< out: id's of the + constraints to drop */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/**********************************************************************//** +Returns a table object and increments its open handle count. +NOTE! This is a high-level function to be used mainly from outside the +'dict' directory. Inside this directory dict_table_get_low +is usually the appropriate function. 
+@param[in] table_name Table name +@param[in] dict_locked TRUE=data dictionary locked +@param[in] try_drop TRUE=try to drop any orphan indexes after + an aborted online index creation +@param[in] ignore_err error to be ignored when loading the table +@return table, NULL if does not exist */ +dict_table_t* +dict_table_open_on_name( + const char* table_name, + ibool dict_locked, + ibool try_drop, + dict_err_ignore_t ignore_err) + MY_ATTRIBUTE((warn_unused_result)); + +/** Outcome of dict_foreign_find_index() or dict_foreign_qualify_index() */ +enum fkerr_t +{ + /** A backing index was found for a FOREIGN KEY constraint */ + FK_SUCCESS = 0, + /** There is no index that covers the columns in the constraint. */ + FK_INDEX_NOT_FOUND, + /** The index is for a prefix index, not a full column. */ + FK_IS_PREFIX_INDEX, + /** A condition of SET NULL conflicts with a NOT NULL column. */ + FK_COL_NOT_NULL, + /** The column types do not match */ + FK_COLS_NOT_EQUAL +}; + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. 
only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error = NULL, /*!< out: error code */ + ulint* err_col_no = NULL, + /*!< out: column number where + error happened */ + dict_index_t** err_index = NULL) + /*!< out: index where error + happened */ + + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); + +/** Returns a virtual column's name. +@param[in] table table object +@param[in] col_nr virtual column number(nth virtual column) +@return column name. */ +const char* +dict_table_get_v_col_name( + const dict_table_t* table, + ulint col_nr); + +/** Check if the table has a given column. +@param[in] table table object +@param[in] col_name column name +@param[in] col_nr column number guessed, 0 as default +@return column number if the table has the specified column, +otherwise table->n_def */ +ulint +dict_table_has_column( + const dict_table_t* table, + const char* col_name, + ulint col_nr = 0); + +/**********************************************************************//** +Outputs info on foreign keys of a table. */ +std::string +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table); /*!< in: table */ + +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. 
*/ +std::string +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline); /*!< in: whether to add a newline */ + +/*********************************************************************//** +Checks whether the given index qualifies as the index for a FOREIGN KEY +constraint on the columns in the array; the outcome is reported in error +(see fkerr_t). +NOTE(review): presumably the criteria are those documented for +dict_foreign_find_index(); confirm against the definition. +@return whether the index qualifies */ +bool +dict_foreign_qualify_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets; only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the first index on the table (the clustered index). +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the last index on the table. 
+@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the next index on the table. +@return index, NULL if none left */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes) +# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes) +# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index) +#endif /* UNIV_DEBUG */ + +/* Skip corrupted index */ +#define dict_table_skip_corrupt_index(index) \ + while (index && index->is_corrupted()) { \ + index = dict_table_get_next_index(index); \ + } + +/* Get the next non-corrupt index */ +#define dict_table_next_uncorrupted_index(index) \ +do { \ + index = dict_table_get_next_index(index); \ + dict_table_skip_corrupt_index(index); \ +} while (0) + +#define dict_index_is_clust(index) (index)->is_clust() +#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust() +#define dict_index_is_unique(index) (index)->is_unique() +#define dict_index_is_spatial(index) (index)->is_spatial() +#define dict_index_is_ibuf(index) (index)->is_ibuf() +#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary() +#define dict_index_has_virtual(index) (index)->has_virtual() + +/** Get all the FTS indexes on a table. 
+@param[in] table table +@param[out] indexes all FTS indexes on this table +@return number of FTS indexes */ +ulint +dict_table_get_all_fts_indexes( + const dict_table_t* table, + ib_vector_t* indexes); + +/********************************************************************//** +Gets the number of user-defined non-virtual columns in a table in the +dictionary cache. +@return number of user-defined (e.g., not ROW_ID) non-virtual +columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_user_cols( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Gets the number of all non-virtual columns (also system) in a table +in the dictionary cache. +@return number of columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_cols( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Gets the number of virtual columns in a table in the dictionary cache. +@param[in] table the table to check +@return number of virtual columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_v_cols( + const dict_table_t* table); + +/** Check if a table has indexed virtual columns +@param[in] table the table to check +@return true if the table has indexed virtual columns */ +UNIV_INLINE +bool +dict_table_has_indexed_v_cols( + const dict_table_t* table); + +/********************************************************************//** +Gets the approximately estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Increment the number of rows in the table by one. 
+Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull)); +/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull)); + +/** Get nth virtual column +@param[in] table target table +@param[in] col_nr column number in MySQL Table definition +@return dict_v_col_t ptr */ +dict_v_col_t* +dict_table_get_nth_v_col_mysql( + const dict_table_t* table, + ulint col_nr); + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint pos) /*!< in: position of column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Gets the nth virtual column of a table. +@param[in] table table +@param[in] pos position of virtual column +@return pointer to virtual column object */ +UNIV_INLINE +dict_v_col_t* +dict_table_get_nth_v_col( + const dict_table_t* table, + ulint pos); +/********************************************************************//** +Gets the given system column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... 
*/ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +#define dict_table_get_nth_col(table, pos) (&(table)->cols[pos]) +#define dict_table_get_sys_col(table, sys) \ + &(table)->cols[(table)->n_cols + (sys) - DATA_N_SYS_COLS] +/* Get nth virtual columns */ +#define dict_table_get_nth_v_col(table, pos) (&(table)->v_cols[pos]) +#endif /* UNIV_DEBUG */ +/** Wrapper function. +@see dict_col_t::name() +@param[in] table table +@param[in] col_nr column number in table +@return column name */ +inline +const char* +dict_table_get_col_name(const dict_table_t* table, ulint col_nr) +{ + return(dict_table_get_nth_col(table, col_nr)->name(*table)); +} + +/********************************************************************//** +Gets the given system column number of a table. +@return column number */ +UNIV_INLINE +unsigned +dict_table_get_sys_col_no( +/*======================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/********************************************************************//** +Returns the minimum data size of an index record. +@return minimum data size in bytes */ +UNIV_INLINE +unsigned +dict_index_get_min_size( +/*====================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define dict_table_is_comp(table) (table)->not_redundant() + +/** Determine if a table uses atomic BLOBs (no locally stored prefix). 
+@param[in] table InnoDB table +@return whether BLOBs are atomic */ +inline +bool +dict_table_has_atomic_blobs(const dict_table_t* table) +{ + return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags)); +} + +/** @return potential max length stored inline for externally stored fields */ +inline size_t dict_table_t::get_overflow_field_local_len() const +{ + if (dict_table_has_atomic_blobs(this)) { + /* ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED: do not + store any BLOB prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE; + } + /* up to MySQL 5.1: store a 768-byte prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; +} + +/** Set the various values in a dict_table_t::flags pointer. +@param[in,out] flags, Pointer to a 4 byte Table Flags +@param[in] format, File Format +@param[in] zip_ssize Zip Shift Size +@param[in] use_data_dir Table uses DATA DIRECTORY +@param[in] page_compressed Table uses page compression +@param[in] page_compression_level Page compression level */ +UNIV_INLINE +void +dict_tf_set( + ulint* flags, + rec_format_t format, + ulint zip_ssize, + bool use_data_dir, + bool page_compressed, + ulint page_compression_level); + +/** Convert a 32 bit integer table flags to the 32 bit FSP Flags. +Fsp Flags are written into the tablespace header at the offset +FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field. +The following chart shows the translation of the low order bit. +Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC +dict_table_t::flags | 0 | 1 | 1 | 1 +fil_space_t::flags | 0 | 0 | 1 | 1 +================================================================== +@param[in] table_flags dict_table_t::flags +@return tablespace flags (fil_space_t::flags) */ +UNIV_INLINE +ulint +dict_tf_to_fsp_flags(ulint table_flags) + MY_ATTRIBUTE((const)); + + +/** Extract the ROW_FORMAT=COMPRESSED page size from table flags. 
+@param[in] flags flags +@return ROW_FORMAT=COMPRESSED page size +@retval 0 if not compressed */ +inline ulint dict_tf_get_zip_size(ulint flags) +{ + flags &= DICT_TF_MASK_ZIP_SSIZE; + return flags + ? (UNIV_ZIP_SIZE_MIN >> 1) + << (FSP_FLAGS_GET_ZIP_SSIZE(flags >> DICT_TF_POS_ZIP_SSIZE + << FSP_FLAGS_POS_ZIP_SSIZE)) + : 0; +} + +/********************************************************************//** +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. +@return TRUE if the column, or its prefix, is in the clustered key */ +ibool +dict_table_col_in_clustered_key( +/*============================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*******************************************************************//** +Check if the table has an FTS index. +@return TRUE if table has an FTS index */ +UNIV_INLINE +ibool +dict_table_has_fts_index( +/*=====================*/ + dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Copies types of virtual columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). +@param[in,out] tuple data tuple +@param[in] table table +*/ +void +dict_table_copy_v_types( + dtuple_t* tuple, + const dict_table_t* table); + +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Looks for an index with the given id. 
NOTE that we do not reserve +the dictionary mutex: this function is for emergency purposes like +printing info of a corrupt database page! +@return index or NULL if not found from cache */ +dict_index_t* +dict_index_find_on_id_low( +/*======================*/ + index_id_t id) /*!< in: index id */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************************//** +Make room in the table cache by evicting an unused table. The unused table +should not be part of FK relationship and currently not used in any user +transaction. There is no guarantee that it will remove a table. +@return number of tables evicted. */ +ulint +dict_make_room_in_cache( +/*====================*/ + ulint max_tables, /*!< in: max tables allowed in cache */ + ulint pct_check); /*!< in: max percent to check */ + +/** Adds an index to the dictionary cache, with possible indexing newly +added column. +@param[in,out] index index; NOTE! The index memory + object is freed in this function! +@param[in] page_no root page number of the index +@param[in] add_v virtual columns being added along with ADD INDEX +@return DB_SUCCESS, or DB_CORRUPTION */ +dberr_t +dict_index_add_to_cache( + dict_index_t*& index, + ulint page_no, + const dict_add_v_col_t* add_v = NULL) + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. 
+@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_fields( +/*====================*/ + const dict_index_t* index) /*!< in: an internal + representation of index (in + the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique( +/*====================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree( +/*============================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** The number of fields in the nonleaf page of spatial index, except +the page no field. */ +#define DICT_INDEX_SPATIAL_NODEPTR_SIZE 1 +/** +Gets the number of fields on nonleaf page level in the internal representation +of an index which uniquely determine the position of an index entry in the +index, if we also take multiversioning into account. Note, it doesn't +include page no field. 
+@param[in] index index +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree_nonleaf( + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of user-defined ordering fields in the index. In the internal +representation we add the row id to the ordering fields to make all indexes +unique, but this function returns the number of fields the user defined +in the index as ordering fields. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth field of an index. +@return pointer to field object */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos)) +#endif /* UNIV_DEBUG */ +/********************************************************************//** +Gets pointer to the nth column in an index. +@return column */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the column number of the nth field in an index. 
+@return column number */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/** Looks for column n in an index. +@param[in] index index +@param[in] n column number +@param[in] inc_prefix true=consider column prefixes too +@param[in] is_virtual true==virtual column +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_col_or_prefix_pos( + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + bool inc_prefix, /*!< in: TRUE=consider + column prefixes too */ + bool is_virtual, /*!< in: is a virtual column + */ + ulint* prefix_col_pos) /*!< out: col num if prefix + */ + __attribute__((warn_unused_result)); +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. 
+@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + const dict_index_t* index, /*!< in: index from which to search */ + const dict_index_t* index2, /*!< in: index */ + ulint n) /*!< in: field number in index2 */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n position in the clustered index. +@return position in internal representation of the clustered index */ +unsigned +dict_table_get_nth_col_pos( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); +/*******************************************************************//** +Adds a column to index. */ +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + const dict_table_t* table, /*!< in: table */ + dict_col_t* col, /*!< in: column */ + ulint prefix_len) /*!< in: column prefix length */ + MY_ATTRIBUTE((nonnull)); + +/*******************************************************************//** +Copies types of fields contained in index to tuple. */ +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_index_t* index, /*!< in: index */ + ulint n_fields) /*!< in: number of + field types to copy */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************************//** +Gets the field column. 
+@return field->col, pointer to the table column */ +UNIV_INLINE +const dict_col_t* +dict_field_get_col( +/*===============*/ + const dict_field_t* field) /*!< in: index field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys.mutex is already being held. +@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + index_id_t index_id) /*!< in: index id */ + MY_ATTRIBUTE((warn_unused_result)); +#ifdef UNIV_DEBUG +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + index_id_t index_id) /*!< in: index id */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************************//** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. +@return TRUE if ok */ +ibool +dict_index_check_search_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index tree */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Whether and when to allow temporary index names */ +enum check_name { + /** Require all indexes to be complete. */ + CHECK_ALL_COMPLETE, + /** Allow aborted online index creation. */ + CHECK_ABORTED_OK, + /** Allow partial indexes to exist. 
*/ + CHECK_PARTIAL_OK +}; +/**********************************************************************//** +Check for duplicate index entries in a table [using the index name] */ +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table, /*!< in: Check for dup indexes + in this table */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ + MY_ATTRIBUTE((nonnull)); +#endif /* UNIV_DEBUG */ +/**********************************************************************//** +Builds a node pointer out of a physical record and a page number. +@return own: node pointer */ +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap, /*!< in: memory heap where pointer + created */ + ulint level) /*!< in: level of rec in tree: + 0 means leaf level */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Convert a physical record into a search tuple. +@param[in] rec index record (not necessarily in an index page) +@param[in] index index +@param[in] leaf whether rec is in a leaf page +@param[in] n_fields number of data fields +@param[in,out] heap memory heap for allocation +@return own: data tuple */ +dtuple_t* +dict_index_build_data_tuple( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + ulint n_fields, + mem_heap_t* heap) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Gets the page number of the root of the index tree. 
+@return page number */ +UNIV_INLINE +uint32_t +dict_index_get_page( +/*================*/ + const dict_index_t* tree) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the read-write lock of the index tree. +@return read-write lock */ +UNIV_INLINE +rw_lock_t* +dict_index_get_lock( +/*================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. +@return number of free bytes on page, reserved for updates */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void); +/*==============================*/ + +/* Online index creation @{ */ +/********************************************************************//** +Gets the status of online index creation. +@return the status */ +UNIV_INLINE +enum online_index_status +dict_index_get_online_status( +/*=========================*/ + const dict_index_t* index) /*!< in: secondary index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Sets the status of online index creation. */ +UNIV_INLINE +void +dict_index_set_online_status( +/*=========================*/ + dict_index_t* index, /*!< in/out: index */ + enum online_index_status status) /*!< in: status */ + MY_ATTRIBUTE((nonnull)); +/********************************************************************//** +Determines if a secondary index is being or has been created online, +or if the table is being rebuilt online, allowing concurrent modifications +to the table. 
+@retval true if the index is being or has been built online, or +if this is a clustered index and the table is being or has been rebuilt online +@retval false if the index has been created or the table has been +rebuilt completely */ +UNIV_INLINE +bool +dict_index_is_online_ddl( +/*=====================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Calculates the minimum record length in an index. */ +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define dict_mutex_enter_for_mysql() mutex_enter(&dict_sys.mutex) +#define dict_mutex_exit_for_mysql() mutex_exit(&dict_sys.mutex) + +/********************************************************************//** +Checks if the database name in two table names is the same. +@return TRUE if same db name */ +ibool +dict_tables_have_same_db( +/*=====================*/ + const char* name1, /*!< in: table name in the form + dbname '/' tablename */ + const char* name2) /*!< in: table name in the form + dbname '/' tablename */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Get an index by name. +@param[in] table the table where to look for the index +@param[in] name the index name to look for +@return index, NULL if does not exist */ +dict_index_t* +dict_table_get_index_on_name(dict_table_t* table, const char* name) + MY_ATTRIBUTE((warn_unused_result)); + +/** Get an index by name. 
+@param[in] table the table where to look for the index +@param[in] name the index name to look for +@return index, NULL if does not exist */ +inline +const dict_index_t* +dict_table_get_index_on_name(const dict_table_t* table, const char* name) +{ + return dict_table_get_index_on_name(const_cast<dict_table_t*>(table), + name); +} + +/*************************************************************** +Check whether a column exists in an FTS index. */ +UNIV_INLINE +ulint +dict_table_is_fts_column( +/*=====================*/ + /* out: ULINT_UNDEFINED if no match else + the offset within the vector */ + ib_vector_t* indexes,/* in: vector containing only FTS indexes */ + ulint col_no, /* in: col number to search for */ + bool is_virtual)/*!< in: whether it is a virtual column */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Looks for an index with the given id given a table instance. +@param[in] table table instance +@param[in] id index id +@return index or NULL */ +dict_index_t* +dict_table_find_index_on_id( + const dict_table_t* table, + index_id_t id) + MY_ATTRIBUTE((nonnull(1))); + +/** Maximum number of columns in a foreign key constraint. Please Note MySQL +has a much lower limit on the number of columns allowed in a foreign key +constraint */ +#define MAX_NUM_FK_COLUMNS 500 + +/* Buffers for storing detailed information about the latest foreign key +and unique key errors */ +extern FILE* dict_foreign_err_file; +extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the + foreign key error messages */ + +/** InnoDB data dictionary cache */ +class dict_sys_t +{ +public: + DictSysMutex mutex; /*!< mutex protecting the data + dictionary; protects also the + disk-based dictionary system tables; + this mutex serializes CREATE TABLE + and DROP TABLE, as well as reading + the dictionary data for a table from + system tables */ + /** @brief the data dictionary rw-latch protecting dict_sys + + Table create, drop, etc. 
reserve this in X-mode; implicit or + background operations purge, rollback, foreign key checks reserve this + in S-mode; not all internal InnoDB operations are covered by MDL. + + This latch also prevents lock waits when accessing the InnoDB + data dictionary tables. @see trx_t::dict_operation_lock_mode */ + rw_lock_t latch; + row_id_t row_id; /*!< the next row id to assign; + NOTE that at a checkpoint this + must be written to the dict system + header and flushed to a file; in + recovery this must be derived from + the log records */ + hash_table_t table_hash; /*!< hash table of the tables, based + on name */ + /** hash table of persistent table IDs */ + hash_table_t table_id_hash; + dict_table_t* sys_tables; /*!< SYS_TABLES table */ + dict_table_t* sys_columns; /*!< SYS_COLUMNS table */ + dict_table_t* sys_indexes; /*!< SYS_INDEXES table */ + dict_table_t* sys_fields; /*!< SYS_FIELDS table */ + dict_table_t* sys_virtual; /*!< SYS_VIRTUAL table */ + + /*=============================*/ + UT_LIST_BASE_NODE_T(dict_table_t) + table_LRU; /*!< List of tables that can be evicted + from the cache */ + UT_LIST_BASE_NODE_T(dict_table_t) + table_non_LRU; /*!< List of tables that can't be + evicted from the cache */ +private: + bool m_initialised; + /** the sequence of temporary table IDs */ + std::atomic<table_id_t> temp_table_id; + /** hash table of temporary table IDs */ + hash_table_t temp_id_hash; +public: + /** @return a new temporary table ID */ + table_id_t get_temporary_table_id() { + return temp_table_id.fetch_add(1, std::memory_order_relaxed); + } + + /** Look up a temporary table. 
+ @param id temporary table ID + @return temporary table + @retval NULL if the table does not exist + (should only happen during the rollback of CREATE...SELECT) */ + dict_table_t* get_temporary_table(table_id_t id) + { + ut_ad(mutex_own(&mutex)); + dict_table_t* table; + ulint fold = ut_fold_ull(id); + HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table, + ut_ad(table->cached), table->id == id); + if (UNIV_LIKELY(table != NULL)) { + DBUG_ASSERT(table->is_temporary()); + DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID); + table->acquire(); + } + return table; + } + + /** Look up a persistent table. + @param id table ID + @return table + @retval NULL if not cached */ + dict_table_t* get_table(table_id_t id) + { + ut_ad(mutex_own(&mutex)); + dict_table_t* table; + ulint fold = ut_fold_ull(id); + HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*, + table, + ut_ad(table->cached), table->id == id); + DBUG_ASSERT(!table || !table->is_temporary()); + return table; + } + + /** + Constructor. Further initialisation happens in create(). + */ + + dict_sys_t() : m_initialised(false), temp_table_id(DICT_HDR_FIRST_ID) {} + + bool is_initialised() const { return m_initialised; } + + /** Initialise the data dictionary cache. */ + void create(); + + /** Close the data dictionary cache on shutdown. */ + void close(); + + /** Resize the hash tables based on the current buffer pool size. */ + void resize(); + + /** Add a table definition to the data dictionary cache */ + inline void add(dict_table_t* table); + /** Remove a table definition from the data dictionary cache. 
+ @param[in,out] table cached table definition to be evicted + @param[in] lru whether this is part of least-recently-used eviction + @param[in] keep whether to keep (not free) the object */ + void remove(dict_table_t* table, bool lru = false, bool keep = false); + +#ifdef UNIV_DEBUG + /** Find a table */ + template <bool in_lru> bool find(dict_table_t* table) + { + ut_ad(table); + ut_ad(table->can_be_evicted == in_lru); + ut_ad(mutex_own(&mutex)); + for (const dict_table_t* t = UT_LIST_GET_FIRST(in_lru + ? table_LRU : table_non_LRU); + t; t = UT_LIST_GET_NEXT(table_LRU, t)) + { + if (t == table) return true; + ut_ad(t->can_be_evicted == in_lru); + } + return false; + } + /** Find a table */ + bool find(dict_table_t* table) + { + return table->can_be_evicted ? find<true>(table) : find<false>(table); + } +#endif + + /** Move a table to the non-LRU list from the LRU list. */ + void prevent_eviction(dict_table_t* table) + { + ut_ad(find(table)); + if (table->can_be_evicted) + { + table->can_be_evicted = FALSE; + UT_LIST_REMOVE(table_LRU, table); + UT_LIST_ADD_LAST(table_non_LRU, table); + } + } + /** Acquire a reference to a cached table. */ + inline void acquire(dict_table_t* table); + +#ifdef UNIV_DEBUG + /** Assert that the data dictionary is locked */ + void assert_locked() + { + ut_ad(mutex_own(&mutex)); + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + } +#endif + /** Lock the data dictionary cache. */ + void lock(const char* file, unsigned line) + { + rw_lock_x_lock_func(&latch, 0, file, line); + mutex_enter_loc(&mutex, file, line); + } + + /** Unlock the data dictionary cache. */ + void unlock() + { + mutex_exit(&mutex); + rw_lock_x_unlock(&latch); + } + + /** Estimate the used memory occupied by the data dictionary + table and index objects. 
+ @return number of bytes occupied */ + ulint rough_size() const + { + /* No mutex; this is a very crude approximation anyway */ + ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU); + size *= sizeof(dict_table_t) + + sizeof(dict_index_t) * 2 + + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10 + + sizeof(dict_field_t) * 5 /* total number of key fields */ + + 200; /* arbitrary, covering names and overhead */ + size += (table_hash.n_cells + table_id_hash.n_cells + + temp_id_hash.n_cells) * sizeof(hash_cell_t); + return size; + } +}; + +/** the data dictionary cache */ +extern dict_sys_t dict_sys; + +#define dict_table_prevent_eviction(table) dict_sys.prevent_eviction(table) +#define dict_sys_lock() dict_sys.lock(__FILE__, __LINE__) +#define dict_sys_unlock() dict_sys.unlock() + +/* Auxiliary structs for checking a table definition @{ */ + +/* This struct is used to specify the name and type that a column must +have when checking a table's schema. */ +struct dict_col_meta_t { + const char* name; /* column name */ + ulint mtype; /* required column main type */ + ulint prtype_mask; /* required column precise type mask; + if this is non-zero then all the + bits it has set must also be set + in the column's prtype */ + ulint len; /* required column length */ +}; + +/* This struct is used for checking whether a given table exists and +whether it has a predefined schema (number of columns and column names +and types) */ +struct dict_table_schema_t { + const char* table_name; /* the name of the table whose + structure we are checking */ + ulint n_cols; /* the number of columns the + table must have */ + dict_col_meta_t* columns; /* metadata for the columns; + this array has n_cols + elements */ + ulint n_foreign; /* number of foreign keys this + table has, pointing to other + tables (where this table is + FK child) */ + ulint n_referenced; /* number of foreign keys other + tables have, pointing to this + table (where this table is + parent) */ +}; +/* 
@} */ + +/*********************************************************************//** +Checks whether a table exists and whether it has the given structure. +The table must have the same number of columns with the same names and +types. The order of the columns does not matter. +The caller must own the dictionary mutex. +dict_table_schema_check() @{ +@return DB_SUCCESS if the table exists and contains the necessary columns */ +dberr_t +dict_table_schema_check( +/*====================*/ + dict_table_schema_t* req_schema, /*!< in/out: required table + schema */ + char* errstr, /*!< out: human readable error + message if != DB_SUCCESS and + != DB_TABLE_NOT_FOUND is + returned */ + size_t errstr_sz) /*!< in: errstr size */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/* @} */ + +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g. aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ + MY_ATTRIBUTE((nonnull)); + +/**********************************************************************//** +Check whether the table is corrupted. 
+@return nonzero for corrupted table, zero for valid tables */ +UNIV_INLINE +ulint +dict_table_is_corrupted( +/*====================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Flags an index and table corrupted both in the data dictionary cache +and in the system table SYS_INDEXES. */ +void +dict_set_corrupted( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx, /*!< in/out: transaction */ + const char* ctx) /*!< in: context */ + ATTRIBUTE_COLD __attribute__((nonnull)); + +/** Flags an index corrupted in the data dictionary cache only. This +is used mostly to mark a corrupted index when index's own dictionary +is corrupted, and we force to load such index for repair purpose +@param[in,out] index index that is corrupted */ +void +dict_set_corrupted_index_cache_only( + dict_index_t* index); + +/**********************************************************************//** +Flags a table with specified space_id corrupted in the table dictionary +cache. +@return TRUE if successful */ +bool dict_set_corrupted_by_space(const fil_space_t* space); + +/** Flag a table encrypted in the data dictionary cache. */ +void dict_set_encrypted_by_space(const fil_space_t* space); + +/** Sets merge_threshold in the SYS_INDEXES +@param[in,out] index index +@param[in] merge_threshold value to set */ +void +dict_index_set_merge_threshold( + dict_index_t* index, + ulint merge_threshold); + +#ifdef UNIV_DEBUG +/** Sets merge_threshold for all indexes in dictionary cache for debug. +@param[in] merge_threshold_all value to set for all indexes */ +void +dict_set_merge_threshold_all_debug( + uint merge_threshold_all); +#endif /* UNIV_DEBUG */ + +/** Validate the table flags. +@param[in] flags Table flags +@return true if valid. 
*/ +UNIV_INLINE +bool +dict_tf_is_valid( + ulint flags); + +/** Validate both table flags and table flags2 and make sure they +are compatible. +@param[in] flags Table flags +@param[in] flags2 Table flags2 +@return true if valid. */ +UNIV_INLINE +bool +dict_tf2_is_valid( + ulint flags, + ulint flags2); + +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************************//** +Return the optimal page size, for which page will likely compress. +@return page size beyond which page may not compress*/ +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Convert table flag to row format string. +@return row format name */ +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag); /*!< in: row format setting */ + +/** encode number of columns and number of virtual columns in one +4 bytes value. 
We could do this because the number of columns in +InnoDB is limited to 1017 +@param[in] n_col number of non-virtual column +@param[in] n_v_col number of virtual column +@return encoded value */ +UNIV_INLINE +ulint +dict_table_encode_n_col( + ulint n_col, + ulint n_v_col); + +/** Decode number of virtual and non-virtual columns in one 4 bytes value. +@param[in] encoded encoded value +@param[in,out] n_col number of non-virtual column +@param[in,out] n_v_col number of virtual column */ +UNIV_INLINE +void +dict_table_decode_n_col( + ulint encoded, + ulint* n_col, + ulint* n_v_col); + +/** Free the virtual column template +@param[in,out] vc_templ virtual column template */ +UNIV_INLINE +void +dict_free_vc_templ( + dict_vcol_templ_t* vc_templ); + +/** Check whether the table have virtual index. +@param[in] table InnoDB table +@return true if the table have virtual index, false otherwise. */ +UNIV_INLINE +bool +dict_table_have_virtual_index( + dict_table_t* table); + +#include "dict0dict.ic" + +#endif diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic new file mode 100644 index 00000000..eda639ba --- /dev/null +++ b/storage/innobase/include/dict0dict.ic @@ -0,0 +1,1248 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0dict.ic +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "fsp0sysspace.h" +#include "dict0pagecompress.h" + +/*********************************************************************//** +Gets the minimum number of bytes per character. +@return minimum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbminlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return col->mbminlen; +} +/*********************************************************************//** +Gets the maximum number of bytes per character. +@return maximum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbmaxlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return col->mbmaxlen; +} +/*********************************************************************//** +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /*!< in: column */ + dtype_t* type) /*!< out: data type */ +{ + ut_ad(col != NULL); + ut_ad(type != NULL); + + type->mtype = col->mtype; + type->prtype = col->prtype; + type->len = col->len; + type->mbminlen = col->mbminlen; + type->mbmaxlen = col->mbmaxlen; +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Assert that a column and a data type match. 
+@return TRUE */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(col->mtype == type->mtype); + ut_ad(col->prtype == type->prtype); + //ut_ad(col->len == type->len); + ut_ad(col->mbminlen == type->mbminlen); + ut_ad(col->mbmaxlen == type->mbmaxlen); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Returns the minimum size of the column. +@return minimum size */ +UNIV_INLINE +unsigned +dict_col_get_min_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return(dtype_get_min_size_low(col->mtype, col->prtype, col->len, + col->mbminlen, col->mbmaxlen)); +} +/***********************************************************************//** +Returns the maximum size of the column. +@return maximum size */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return(dtype_get_max_size_low(col->mtype, col->len)); +} +/***********************************************************************//** +Returns the size of a fixed size column, 0 if not a fixed size column. +@return fixed size, or 0 */ +UNIV_INLINE +unsigned +dict_col_get_fixed_size( +/*====================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len, + col->mbminlen, col->mbmaxlen, comp)); +} +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. 
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +unsigned +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dict_col_get_fixed_size(col, comp)); +} + +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +unsigned +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ +{ + return(col->ind); +} + +/*********************************************************************//** +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ +{ + ulint i; + + ut_ad(dict_index_is_clust(clust_index)); + + for (i = 0; i < clust_index->n_def; i++) { + const dict_field_t* field = &clust_index->fields[i]; + + if (!field->prefix_len && field->col == col) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/** Gets the column position in the given index. +@param[in] col table column +@param[in] index index to be searched for column +@return position of column in the given index. */ +UNIV_INLINE +ulint +dict_col_get_index_pos( + const dict_col_t* col, + const dict_index_t* index) +{ + ulint i; + + for (i = 0; i < index->n_def; i++) { + const dict_field_t* field = &index->fields[i]; + + if (!field->prefix_len && field->col == col) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the first index on the table (the clustered index). 
+@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes)); +} + +/********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table)) + ->indexes)); +} + +/********************************************************************//** +Gets the next index on the table. +@return index, NULL if none left */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index)); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Gets the number of user-defined non-virtual columns in a table in the +dictionary cache. +@return number of user-defined (e.g., not ROW_ID) non-virtual +columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_user_cols( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + /* n_cols counts stored columns only. A table may contain + virtual columns and no user-specified stored columns at all. */ + ut_ad(table->n_cols >= DATA_N_SYS_COLS); + return unsigned(table->n_cols) - DATA_N_SYS_COLS; +} + +/********************************************************************//** +Gets the number of all non-virtual columns (also system) in a table +in the dictionary cache. 
+@return number of non-virtual columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_cols( +/*==================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + return(table->n_cols); +} + +/** Gets the number of virtual columns in a table in the dictionary cache. +@param[in] table the table to check +@return number of virtual columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_v_cols( + const dict_table_t* table) +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->n_v_cols); +} + +/** Check if a table has indexed virtual columns +@param[in] table the table to check +@return true is the table has indexed virtual columns */ +UNIV_INLINE +bool +dict_table_has_indexed_v_cols( + const dict_table_t* table) +{ + + for (unsigned i = 0; i < table->n_v_cols; i++) { + const dict_v_col_t* col = dict_table_get_nth_v_col(table, i); + if (col->m_col.ord_part) { + return(true); + } + } + + return(false); +} + +/********************************************************************//** +Gets the approximately estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->stat_initialized); + + return(table->stat_n_rows); +} + +/********************************************************************//** +Increment the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. 
*/ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ +{ + if (table->stat_initialized) { + ib_uint64_t n_rows = table->stat_n_rows; + if (n_rows < 0xFFFFFFFFFFFFFFFFULL) { + table->stat_n_rows = n_rows + 1; + } + } +} + +/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ +{ + if (table->stat_initialized) { + ib_uint64_t n_rows = table->stat_n_rows; + if (n_rows > 0) { + table->stat_n_rows = n_rows - 1; + } + } +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint pos) /*!< in: position of column */ +{ + ut_ad(pos < table->n_def); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return((dict_col_t*) (table->cols) + pos); +} + +/** Gets the nth virtual column of a table. +@param[in] table table +@param[in] pos position of virtual column +@return pointer to virtual column object */ +UNIV_INLINE +dict_v_col_t* +dict_table_get_nth_v_col( + const dict_table_t* table, + ulint pos) +{ + ut_ad(table); + ut_ad(pos < table->n_v_def); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->v_cols[pos].m_col.is_added()); + ut_ad(!table->v_cols[pos].m_col.is_dropped()); + return &table->v_cols[pos]; +} + +/********************************************************************//** +Gets the given system column of a table. 
+@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... */ +{ + dict_col_t* col; + col = dict_table_get_nth_col(table, + dict_table_get_sys_col_no(table, sys)); + ut_ad(col->mtype == DATA_SYS); + ut_ad(col->prtype == (sys | DATA_NOT_NULL)); + + return(col); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Gets the given system column number of a table. +@return column number */ +UNIV_INLINE +unsigned +dict_table_get_sys_col_no( +/*======================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... */ +{ + ut_ad(sys < DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS); +} + +/************************************************************************ +Check if the table has an FTS index. */ +UNIV_INLINE +ibool +dict_table_has_fts_index( +/*=====================*/ + /* out: TRUE if table has an FTS index */ + dict_table_t* table) /* in: table */ +{ + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)); +} + +/** Validate the flags for tables that are not ROW_FORMAT=REDUNDANT. 
+@param[in] flags table flags +@return whether the flags are valid */ +inline +bool +dict_tf_is_valid_not_redundant(ulint flags) +{ + const bool atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); + + ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); + + if (!zip_ssize) { + /* Not ROW_FORMAT=COMPRESSED */ + } else if (!atomic_blobs) { + /* ROW_FORMAT=COMPRESSED implies ROW_FORMAT=DYNAMIC + for the uncompressed page format */ + return(false); + } else if (zip_ssize > PAGE_ZIP_SSIZE_MAX + || zip_ssize > srv_page_size_shift + || srv_page_size_shift > UNIV_ZIP_SIZE_SHIFT_MAX) { + /* KEY_BLOCK_SIZE is out of bounds, or + ROW_FORMAT=COMPRESSED is not supported with this + innodb_page_size (only up to 16KiB) */ + return(false); + } + + switch (DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags)) { + case 0: + /* PAGE_COMPRESSION_LEVEL=0 should imply PAGE_COMPRESSED=NO */ + return(!DICT_TF_GET_PAGE_COMPRESSION(flags)); + case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9: + /* PAGE_COMPRESSION_LEVEL requires + ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC + (not ROW_FORMAT=COMPRESSED or ROW_FORMAT=REDUNDANT) + and PAGE_COMPRESSED=YES */ + return(!zip_ssize && DICT_TF_GET_PAGE_COMPRESSION(flags)); + default: + /* Invalid PAGE_COMPRESSION_LEVEL value */ + return(false); + } +} + +/** Validate the table flags. +@param[in] flags Table flags +@return true if valid. */ +UNIV_INLINE +bool +dict_tf_is_valid( + ulint flags) +{ + ut_ad(flags < 1U << DICT_TF_BITS); + /* The DATA_DIRECTORY flag can be assigned fully independently + of all other persistent table flags. */ + flags &= ~DICT_TF_MASK_DATA_DIR; + if (!(flags & 1)) { + /* Only ROW_FORMAT=REDUNDANT has 0 in the least significant + bit. For ROW_FORMAT=REDUNDANT, only the DATA_DIR flag + (which we cleared above) can be set. If any other flags + are set, the flags are invalid. 
*/ + return(flags == 0 || flags == DICT_TF_MASK_NO_ROLLBACK); + } + + return(dict_tf_is_valid_not_redundant(flags)); +} + +/** Validate both table flags and table flags2 and make sure they +are compatible. +@param[in] flags Table flags +@param[in] flags2 Table flags2 +@return true if valid. */ +UNIV_INLINE +bool +dict_tf2_is_valid( + ulint flags, + ulint flags2) +{ + if (!dict_tf_is_valid(flags)) { + return(false); + } + + if ((flags2 & DICT_TF2_UNUSED_BIT_MASK) != 0) { + return(false); + } + + return(true); +} + +/********************************************************************//** +Determine the file format from dict_table_t::flags +The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any +other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set. +@return file format version */ +UNIV_INLINE +rec_format_t +dict_tf_get_rec_format( +/*===================*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + ut_a(dict_tf_is_valid(flags)); + + if (!DICT_TF_GET_COMPACT(flags)) { + return(REC_FORMAT_REDUNDANT); + } + + if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) { + return(REC_FORMAT_COMPACT); + } + + if (DICT_TF_GET_ZIP_SSIZE(flags)) { + return(REC_FORMAT_COMPRESSED); + } + + return(REC_FORMAT_DYNAMIC); +} + +/** Set the various values in a dict_table_t::flags pointer. +@param[in,out] flags, Pointer to a 4 byte Table Flags +@param[in] format File Format +@param[in] zip_ssize Zip Shift Size +@param[in] use_data_dir Table uses DATA DIRECTORY +@param[in] page_compressed Table uses page compression +@param[in] page_compression_level Page compression level */ +UNIV_INLINE +void +dict_tf_set( +/*========*/ + ulint* flags, + rec_format_t format, + ulint zip_ssize, + bool use_data_dir, + bool page_compressed, + ulint page_compression_level) +{ + *flags = use_data_dir ? 
1 << DICT_TF_POS_DATA_DIR : 0; + + switch (format) { + case REC_FORMAT_REDUNDANT: + ut_ad(zip_ssize == 0); + /* no other options are allowed */ + ut_ad(!page_compressed); + return; + case REC_FORMAT_COMPACT: + *flags |= DICT_TF_COMPACT; + ut_ad(zip_ssize == 0); + break; + case REC_FORMAT_COMPRESSED: + *flags |= DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (zip_ssize << DICT_TF_POS_ZIP_SSIZE); + break; + case REC_FORMAT_DYNAMIC: + *flags |= DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS); + ut_ad(zip_ssize == 0); + break; + } + + if (page_compressed) { + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } +} + +/** Convert a 32 bit integer table flags to the 32 bit FSP Flags. +Fsp Flags are written into the tablespace header at the offset +FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field. +The following chart shows the translation of the low order bit. +Other bits are the same. 
========================= Low order bit ==========================
                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
dict_table_t::flags |     0     |    1    |     1      |    1
fil_space_t::flags  |     0     |    0    |     1      |    1
==================================================================
In debug builds, the "dict_tf_to_fsp_flags_failure" injection point makes
this function return ULINT_UNDEFINED.
@param[in]	table_flags	dict_table_t::flags
@return tablespace flags (fil_space_t::flags) */
UNIV_INLINE
ulint
dict_tf_to_fsp_flags(ulint table_flags)
{
	ulint fsp_flags;
	ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(
		table_flags);

	/* The PAGE_COMPRESSION flag and a nonzero compression level
	must be set together or not at all. */
	ut_ad((DICT_TF_GET_PAGE_COMPRESSION(table_flags) == 0)
	      == (page_compression_level == 0));

	DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure",
			return(ULINT_UNDEFINED););

	/* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32:
	the full_crc32 flag layout is used only when ZIP_SSIZE is 0. */
	if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
	     || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32)
	    && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) {

		/* << binds tighter than |: marker bit OR page size bits */
		fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
			| FSP_FLAGS_FCRC32_PAGE_SSIZE();

		if (page_compression_level) {
			/* Record the currently configured compression
			algorithm in the flags. */
			fsp_flags |= innodb_compression_algorithm
				<< FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
		}
	} else {
		/* Adjust bit zero. */
		fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;

		/* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
		fsp_flags |= table_flags
			& (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS);

		fsp_flags |= FSP_FLAGS_PAGE_SSIZE();

		if (page_compression_level) {
			fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION;
		}
	}

	/* Validate before the FSP_FLAGS_MEM_* bits are added below. */
	ut_a(fil_space_t::is_valid_flags(fsp_flags, false));

	/* NOTE(review): the FSP_FLAGS_MEM_* bits are presumably kept only
	in memory and not written to the tablespace header - confirm. */
	if (DICT_TF_HAS_DATA_DIR(table_flags)) {
		fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR;
	}

	fsp_flags |= page_compression_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL;

	return(fsp_flags);
}

/********************************************************************//**
Convert a 32 bit integer table flags to the 32bit integer that is written
to a SYS_TABLES.TYPE field.
The following chart shows the translation of +the low order bit. Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +dict_table_t::flags | 0 | 1 | 1 +SYS_TABLES.TYPE | 1 | 1 | 1 +================================================================== +@return ulint containing SYS_TABLES.TYPE */ +UNIV_INLINE +ulint +dict_tf_to_sys_tables_type( +/*=======================*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + ulint type; + + ut_a(dict_tf_is_valid(flags)); + + /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ + type = 1; + + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL are the same. */ + type |= flags & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_NO_ROLLBACK); + + return(type); +} + +/********************************************************************//** +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_fields( +/*====================*/ + const dict_index_t* index) /*!< in: an internal + representation of index (in + the dictionary cache) */ +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + return(index->n_fields); +} + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. 
+@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique( +/*====================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + return(index->n_uniq); +} + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree( +/*============================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + if (dict_index_is_clust(index)) { + + return(dict_index_get_n_unique(index)); + } + + return(dict_index_get_n_fields(index)); +} + +/** +Gets the number of fields on nonleaf page level in the internal representation +of an index which uniquely determine the position of an index entry in the +index, if we also take multiversioning into account. Note, it doesn't +include page no field. +@param[in] index index +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree_nonleaf( + const dict_index_t* index) +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + if (dict_index_is_spatial(index)) { + /* For spatial index, on non-leaf page, we have only + 2 fields(mbr+page_no). So, except page no field, + there's one field there. */ + return(DICT_INDEX_SPATIAL_NODEPTR_SIZE); + } else { + return(dict_index_get_n_unique_in_tree(index)); + } +} + +/********************************************************************//** +Gets the number of user-defined ordering fields in the index. 
In the internal +representation of clustered indexes we add the row id to the ordering fields +to make a clustered index unique, but this function returns the number of +fields the user defined in the index as ordering fields. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + return(index->n_user_defined_cols); +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth field of an index. +@return pointer to field object */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of field */ +{ + ut_ad(pos < index->n_def); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return((dict_field_t*) (index->fields) + pos); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets the field column. +@return field->col, pointer to the table column */ +UNIV_INLINE +const dict_col_t* +dict_field_get_col( +/*===============*/ + const dict_field_t* field) /*!< in: index field */ +{ + return(field->col); +} + +/********************************************************************//** +Gets pointer to the nth column in an index. +@return column */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ +{ + return(dict_field_get_col(dict_index_get_nth_field(index, pos))); +} + +/********************************************************************//** +Gets the column number the nth field in an index. 
+@return column number */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ +{ + return(dict_col_get_no(dict_index_get_nth_col(index, pos))); +} + +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ +{ + return(dict_index_get_nth_col_or_prefix_pos(index, n, false, false, + prefix_col_pos)); +} + +/********************************************************************//** +Returns the minimum data size of an index record. +@return minimum data size in bytes */ +UNIV_INLINE +unsigned +dict_index_get_min_size( +/*====================*/ + const dict_index_t* index) /*!< in: index */ +{ + unsigned n= dict_index_get_n_fields(index); + unsigned size= 0; + + while (n--) + size+= dict_col_get_min_size(dict_index_get_nth_col(index, n)); + + return size; +} + +/*********************************************************************//** +Gets the page number of the root of the index tree. +@return page number */ +UNIV_INLINE +uint32_t +dict_index_get_page( +/*================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->page); +} + +/*********************************************************************//** +Gets the read-write lock of the index tree. 
+@return read-write lock */ +UNIV_INLINE +rw_lock_t* +dict_index_get_lock( +/*================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(&(index->lock)); +} + +/********************************************************************//** +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. +@return number of free bytes on page, reserved for updates */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void) +/*==============================*/ +{ + return(srv_page_size / 16); +} + +/********************************************************************//** +Gets the status of online index creation. +@return the status */ +UNIV_INLINE +enum online_index_status +dict_index_get_online_status( +/*=========================*/ + const dict_index_t* index) /*!< in: secondary index */ +{ + enum online_index_status status; + + status = (enum online_index_status) index->online_status; + + /* Without the index->lock protection, the online + status can change from ONLINE_INDEX_CREATION to + ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in + row_log_apply() once log application is done. So to make + sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE + you should always do the recheck after acquiring index->lock */ + +#ifdef UNIV_DEBUG + switch (status) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + return(status); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(status); +} + +/********************************************************************//** +Sets the status of online index creation. 
*/ +UNIV_INLINE +void +dict_index_set_online_status( +/*=========================*/ + dict_index_t* index, /*!< in/out: index */ + enum online_index_status status) /*!< in: status */ +{ + ut_ad(!(index->type & DICT_FTS)); + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)); + +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + break; + case ONLINE_INDEX_ABORTED: + ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED); + break; + case ONLINE_INDEX_ABORTED_DROPPED: + ut_error; + } +#endif /* UNIV_DEBUG */ + + index->online_status = status & 3; + ut_ad(dict_index_get_online_status(index) == status); +} + +/********************************************************************//** +Determines if a secondary index is being or has been created online, +or if the table is being rebuilt online, allowing concurrent modifications +to the table. +@retval true if the index is being or has been built online, or +if this is a clustered index and the table is being or has been rebuilt online +@retval false if the index has been created or the table has been +rebuilt completely */ +UNIV_INLINE +bool +dict_index_is_online_ddl( +/*=====================*/ + const dict_index_t* index) /*!< in: index */ +{ +#ifdef UNIV_DEBUG + if (dict_index_is_clust(index)) { + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + return(true); + case ONLINE_INDEX_COMPLETE: + return(false); + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + ut_ad(0); + return(false); + } +#endif /* UNIV_DEBUG */ + + return(UNIV_UNLIKELY(dict_index_get_online_status(index) + != ONLINE_INDEX_COMPLETE)); +} + +/**********************************************************************//** +Check whether a column exists in an FTS index. 
+@return ULINT_UNDEFINED if no match else the offset within the vector */ +UNIV_INLINE +ulint +dict_table_is_fts_column( +/*=====================*/ + ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */ + ulint col_no, /*!< in: col number to search for */ + bool is_virtual) /*!< in: whether it is a virtual column */ + +{ + ulint i; + + for (i = 0; i < ib_vector_size(indexes); ++i) { + dict_index_t* index; + + index = (dict_index_t*) ib_vector_getp(indexes, i); + + if (index->contains_col_or_prefix(col_no, is_virtual)) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Determine bytes of column prefix to be stored in the undo log. Please +note that if !dict_table_has_atomic_blobs(table), no prefix +needs to be stored in the undo log. +@return bytes of column prefix to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_field_len_store_undo( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column which index prefix + is based on */ +{ + if (!dict_table_has_atomic_blobs(table)) { + return(0); + } + + if (col->max_prefix != 0) { + return(col->max_prefix); + } + + return(REC_VERSION_56_MAX_INDEX_COL_LEN); +} + +/** Determine maximum bytes of a virtual column need to be stored +in the undo log. 
+@param[in] table dict_table_t for the table +@param[in] col_no virtual column number +@return maximum bytes of virtual column to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_v_field_len_store_undo( + dict_table_t* table, + ulint col_no) +{ + const dict_col_t* col + = &dict_table_get_nth_v_col(table, col_no)->m_col; + ulint max_log_len; + + /* This calculation conforms to the non-virtual column + maximum log length calculation: + 1) if No atomic BLOB, upto REC_ANTELOPE_MAX_INDEX_COL_LEN + 2) if atomic BLOB, upto col->max_prefix or + REC_VERSION_56_MAX_INDEX_COL_LEN, whichever is less */ + if (dict_table_has_atomic_blobs(table)) { + if (DATA_BIG_COL(col) && col->max_prefix > 0) { + max_log_len = col->max_prefix; + } else { + max_log_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table); + } + } else { + max_log_len = REC_ANTELOPE_MAX_INDEX_COL_LEN; + } + + return(max_log_len); +} + +/********************************************************************//** +Check whether the table is corrupted. +@return nonzero for corrupted table, zero for valid tables */ +UNIV_INLINE +ulint +dict_table_is_corrupted( +/*====================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + return(table->corrupted); +} + +/** Check if the table is found is a file_per_table tablespace. +This test does not use table flags2 since some REDUNDANT tables in the +system tablespace may have garbage in the MIX_LEN field where flags2 is +stored. These garbage MIX_LEN fields were written before v3.23.52. +A patch was added to v3.23.52 which initializes the MIX_LEN field to 0. +Since file-per-table tablespaces were added in 4.1, any SYS_TABLES +record with a non-zero space ID will have a reliable MIX_LEN field. +However, this test does not use flags2 from SYS_TABLES.MIX_LEN. Instead, +assume that if the tablespace is not a predefined system tablespace, + then it must be file-per-table. 
+Also, during ALTER TABLE, the DICT_TF2_USE_FILE_PER_TABLE flag may not be +set on one of the file-per-table tablespaces. +This test cannot be done on a table in the process of being created +because the space_id will be zero until the tablespace is created. +@param[in] table An existing open table to check +@return true if this table was created as a file-per-table tablespace. */ +UNIV_INLINE +bool +dict_table_is_file_per_table( + const dict_table_t* table) /*!< in: table to check */ +{ + return table->space != fil_system.sys_space + && table->space != fil_system.temp_space; +} + +/** Acquire the table handle. */ +inline +void +dict_table_t::acquire() +{ + ut_ad(mutex_own(&dict_sys.mutex)); + n_ref_count++; +} + +/** Release the table handle. +@return whether the last handle was released */ +inline +bool +dict_table_t::release() +{ + auto n = n_ref_count--; + ut_ad(n > 0); + return n == 1; +} + +/** Encode the number of columns and number of virtual columns in a +4 bytes value. We could do this because the number of columns in +InnoDB is limited to 1017 +@param[in] n_col number of non-virtual column +@param[in] n_v_col number of virtual column +@return encoded value */ +UNIV_INLINE +ulint +dict_table_encode_n_col( + ulint n_col, + ulint n_v_col) +{ + return(n_col + (n_v_col<<16)); +} + +/** decode number of virtual and non-virtual columns in one 4 bytes value. 
+@param[in] encoded encoded value +@param[in,out] n_col number of non-virtual column +@param[in,out] n_v_col number of virtual column */ +UNIV_INLINE +void +dict_table_decode_n_col( + ulint encoded, + ulint* n_col, + ulint* n_v_col) +{ + + ulint num = encoded & ~DICT_N_COLS_COMPACT; + *n_v_col = num >> 16; + *n_col = num & 0xFFFF; +} + +/** Free the virtual column template +@param[in,out] vc_templ virtual column template */ +void +dict_free_vc_templ( + dict_vcol_templ_t* vc_templ) +{ + UT_DELETE_ARRAY(vc_templ->default_rec); + vc_templ->default_rec = NULL; + + if (vc_templ->vtempl != NULL) { + ut_ad(vc_templ->n_v_col > 0); + for (ulint i = 0; i < vc_templ->n_col + + vc_templ->n_v_col; i++) { + if (vc_templ->vtempl[i] != NULL) { + ut_free(vc_templ->vtempl[i]); + } + } + ut_free(vc_templ->vtempl); + vc_templ->vtempl = NULL; + } +} + +/** Check whether the table have virtual index. +@param[in] table InnoDB table +@return true if the table have virtual index, false otherwise. */ +UNIV_INLINE +bool +dict_table_have_virtual_index( + dict_table_t* table) +{ + for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table); + col_no++) { + const dict_v_col_t* col + = dict_table_get_nth_v_col(table, col_no); + + if (col->m_col.ord_part) { + return(true); + } + } + + return(false); +} diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h new file mode 100644 index 00000000..f067571c --- /dev/null +++ b/storage/innobase/include/dict0load.h @@ -0,0 +1,309 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0load.h +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0load_h +#define dict0load_h + +#include "dict0types.h" +#include "trx0types.h" +#include "ut0byte.h" +#include "mem0mem.h" +#include "btr0types.h" + +#include <deque> + +/** A stack of table names related through foreign key constraints */ +typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t; + +/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */ +enum dict_system_id_t { + SYS_TABLES = 0, + SYS_INDEXES, + SYS_COLUMNS, + SYS_FIELDS, + SYS_FOREIGN, + SYS_FOREIGN_COLS, + SYS_TABLESPACES, + SYS_DATAFILES, + SYS_VIRTUAL, + + /* This must be last item. Defines the number of system tables. */ + SYS_NUM_SYSTEM_TABLES +}; + +/** Check each tablespace found in the data dictionary. +Look at each table defined in SYS_TABLES that has a space_id > 0. +If the tablespace is not yet in the fil_system cache, look up the +tablespace in SYS_DATAFILES to ensure the correct path. + +In a crash recovery we already have some tablespace objects created from +processing the REDO log. Any other tablespace in SYS_TABLESPACES not +previously used in recovery will be opened here. 
We will compare the +space_id information in the data dictionary to what we find in the +tablespace file. In addition, more validation will be done if recovery +was needed and force_recovery is not set. + +We also scan the biggest space id, and store it to fil_system. */ +void dict_check_tablespaces_and_store_max_id(); + +/********************************************************************//** +Finds the first table name in the given database. +@return own: table name, NULL if does not exist; the caller must free +the memory in the string! */ +char* +dict_get_first_table_name_in_db( +/*============================*/ + const char* name); /*!< in: database name which ends to '/' */ + +/** Make sure the data_file_name is saved in dict_table_t if needed. +Try to read it from the fil_system first, then from SYS_DATAFILES. +@param[in] table Table object +@param[in] dict_mutex_own true if dict_sys.mutex is owned already */ +void +dict_get_and_save_data_dir_path( + dict_table_t* table, + bool dict_mutex_own); + +/** Loads a table definition and also all its index definitions, and also +the cluster definition if the table is a member in a cluster. Also loads +all foreign key constraints where the foreign key is in the table or where +a foreign key references columns in this table. +@param[in] name Table name in the dbname/tablename format +@param[in] ignore_err Error to be ignored when loading + table and its index definition +@return table, NULL if does not exist; if the table is stored in an +.ibd file, but the file does not exist, then we set the file_unreadable +flag in the table object we return. */ +dict_table_t* dict_load_table(const char* name, dict_err_ignore_t ignore_err); + +/***********************************************************************//** +Loads a table object based on the table id. 
+@return table; NULL if table does not exist */ +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err); /*!< in: errors to ignore + when loading the table */ +/********************************************************************//** +This function is called when the database is booted. +Loads system table index definitions except for the clustered index which +is added to the dictionary cache at booting before calling this function. */ +void +dict_load_sys_table( +/*================*/ + dict_table_t* table); /*!< in: system table */ +/***********************************************************************//** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. + +The foreign key constraint is loaded only if the referenced table is also +in the dictionary cache. If the referenced table is not in dictionary +cache, then it is added to the output parameter (fk_tables). + +@return DB_SUCCESS or error code */ +dberr_t +dict_load_foreigns( +/*===============*/ + const char* table_name, /*!< in: table name */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + bool check_recursive,/*!< in: Whether to check + recursive load of tables + chained by FK */ + bool check_charsets, /*!< in: whether to check + charset compatibility */ + dict_err_ignore_t ignore_err, /*!< in: error to be ignored */ + dict_names_t& fk_tables) /*!< out: stack of table names + which must be loaded + subsequently to load all the + foreign key constraints. */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/********************************************************************//** +This function opens a system table, and return the first record. 
+@return first record of the system table */ +const rec_t* +dict_startscan_system( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor to + the record */ + mtr_t* mtr, /*!< in: the mini-transaction */ + dict_system_id_t system_id); /*!< in: which system table to open */ +/********************************************************************//** +This function get the next system table record as we scan the table. +@return the record if found, NULL if end of scan. */ +const rec_t* +dict_getnext_system( +/*================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor + to the record */ + mtr_t* mtr); /*!< in: the mini-transaction */ +/********************************************************************//** +This function processes one SYS_TABLES record and populate the dict_table_t +struct for the table. +@return error message, or NULL on success */ +const char* +dict_process_sys_tables_rec_and_mtr_commit( +/*=======================================*/ + mem_heap_t* heap, /*!< in: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_TABLES record */ + dict_table_t** table, /*!< out: dict_table_t to fill */ + bool cached, /*!< in: whether to load from cache */ + mtr_t* mtr); /*!< in/out: mini-transaction, + will be committed */ +/********************************************************************//** +This function parses a SYS_INDEXES record and populate a dict_index_t +structure with the information from the record. For detail information +about SYS_INDEXES fields, please refer to dict_boot() function. 
+@return error message, or NULL on success */ +const char* +dict_process_sys_indexes_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_INDEXES rec */ + dict_index_t* index, /*!< out: dict_index_t to be + filled */ + table_id_t* table_id); /*!< out: table id */ +/********************************************************************//** +This function parses a SYS_COLUMNS record and populates a dict_col_t +structure with the information from the record. +@return error message, or NULL on success */ +const char* +dict_process_sys_columns_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_COLUMNS rec */ + dict_col_t* column, /*!< out: dict_col_t to be filled */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name, /*!< out: column name */ + ulint* nth_v_col); /*!< out: if virtual col, this + records its sequence number */ + +/** This function parses a SYS_VIRTUAL record and extracts virtual column +information +@param[in] rec current SYS_VIRTUAL rec +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@return error message, or NULL on success */ +const char* +dict_process_sys_virtual_rec( + const rec_t* rec, + table_id_t* table_id, + ulint* pos, + ulint* base_pos); +/********************************************************************//** +This function parses a SYS_FIELDS record and populate a dict_field_t +structure with the information from the record. 
+@return error message, or NULL on success */ +const char* +dict_process_sys_fields_rec( +/*========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FIELDS rec */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + index_id_t* index_id, /*!< out: current index id */ + index_id_t last_id); /*!< in: previous index id */ +/********************************************************************//** +This function parses a SYS_FOREIGN record and populate a dict_foreign_t +structure with the information from the record. For detail information +about SYS_FOREIGN fields, please refer to dict_load_foreign() function +@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN rec */ + dict_foreign_t* foreign); /*!< out: dict_foreign_t to be + filled */ +/********************************************************************//** +This function parses a SYS_FOREIGN_COLS record and extract necessary +information from the record and return to caller. +@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_col_rec( +/*=============================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */ + const char** name, /*!< out: foreign key constraint name */ + const char** for_col_name, /*!< out: referencing column name */ + const char** ref_col_name, /*!< out: referenced column name + in referenced table */ + ulint* pos); /*!< out: column position */ +/********************************************************************//** +This function parses a SYS_TABLESPACES record, extracts necessary +information from the record and returns to caller. 
+@return error message, or NULL on success */ +const char* +dict_process_sys_tablespaces( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */ + uint32_t* space, /*!< out: tablespace identifier */ + const char** name, /*!< out: tablespace name */ + ulint* flags); /*!< out: tablespace flags */ +/********************************************************************//** +This function parses a SYS_DATAFILES record, extracts necessary +information from the record and returns to caller. +@return error message, or NULL on success */ +const char* +dict_process_sys_datafiles( +/*=======================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_DATAFILES rec */ + uint32_t* space, /*!< out: tablespace identifier */ + const char** path); /*!< out: datafile path */ + +/** Update the record for space_id in SYS_TABLESPACES to this filepath. +@param[in] space_id Tablespace ID +@param[in] filepath Tablespace filepath +@return DB_SUCCESS if OK, dberr_t if the insert failed */ +dberr_t +dict_update_filepath( + ulint space_id, + const char* filepath); + +/** Replace records in SYS_TABLESPACES and SYS_DATAFILES associated with +the given space_id using an independent transaction. 
+@param[in] space_id Tablespace ID +@param[in] name Tablespace name +@param[in] filepath First filepath +@param[in] fsp_flags Tablespace flags +@return DB_SUCCESS if OK, dberr_t if the insert failed */ +dberr_t +dict_replace_tablespace_and_filepath( + ulint space_id, + const char* name, + const char* filepath, + ulint fsp_flags); + +#endif diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h new file mode 100644 index 00000000..9d7dcf47 --- /dev/null +++ b/storage/innobase/include/dict0mem.h @@ -0,0 +1,2542 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0mem.h +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0mem_h +#define dict0mem_h + +#include "data0type.h" +#include "mem0mem.h" +#include "row0types.h" +#include "rem0types.h" +#include "btr0types.h" +#include "lock0types.h" +#include "que0types.h" +#include "sync0rw.h" +#include "ut0mem.h" +#include "ut0rnd.h" +#include "ut0byte.h" +#include "hash0hash.h" +#include "trx0types.h" +#include "fts0fts.h" +#include "buf0buf.h" +#include "gis0type.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "mysql_com.h" +#include <sql_const.h> +#include <set> +#include <algorithm> +#include <iterator> +#include <ostream> +#include <mutex> + +/* Forward declaration. 
*/ +struct ib_rbt_t; + +/** Type flags of an index: OR'ing of the flags is allowed to define a +combination of types */ +/* @{ */ +#define DICT_CLUSTERED 1 /*!< clustered index; for other than + auto-generated clustered indexes, + also DICT_UNIQUE will be set */ +#define DICT_UNIQUE 2 /*!< unique index */ +#define DICT_IBUF 8 /*!< insert buffer tree */ +#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag + in SYS_INDEXES.TYPE */ +#define DICT_FTS 32 /* FTS index; can't be combined with the + other flags */ +#define DICT_SPATIAL 64 /* SPATIAL index; can't be combined with the + other flags */ +#define DICT_VIRTUAL 128 /* Index on Virtual column */ + +#define DICT_IT_BITS 8 /*!< number of bits used for + SYS_INDEXES.TYPE */ +/* @} */ + +#if 0 /* not implemented, retained for history */ +/** Types for a table object */ +#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */ +#define DICT_TABLE_CLUSTER_MEMBER 2 +#define DICT_TABLE_CLUSTER 3 /* this means that the table is + really a cluster definition */ +#endif + +/* Table and tablespace flags are generally not used for the Antelope file +format except for the low order bit, which is used differently depending on +where the flags are stored. + +==================== Low order flags bit ========================= + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +SYS_TABLES.TYPE | 1 | 1 | 1 +dict_table_t::flags | 0 | 1 | 1 +FSP_SPACE_FLAGS | 0 | 0 | 1 +fil_space_t::flags | 0 | 0 | 1 + +Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1) +and the tablespace flags field was always 0. In the 5.1 plugin, these fields +were repurposed to identify compressed and dynamic row formats. + +The following types and constants describe the flags found in dict_table_t +and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS +are described in fsp0fsp.h. 
*/ + +/* @{ */ +/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */ +#define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */ +/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */ +#define DICT_TF_COMPACT 1U /*!< Compact row format. */ + +/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether +the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */ +#define DICT_N_COLS_COMPACT 0x80000000UL + +/** Width of the COMPACT flag */ +#define DICT_TF_WIDTH_COMPACT 1 + +/** Width of the ZIP_SSIZE flag */ +#define DICT_TF_WIDTH_ZIP_SSIZE 4 + +/** Width of the ATOMIC_BLOBS flag. The ROW_FORMAT=REDUNDANT and +ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes +in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED +store the whole blob or text field off-page atomically. +Secondary indexes are created from this external data using row_ext_t +to cache the BLOB prefixes. */ +#define DICT_TF_WIDTH_ATOMIC_BLOBS 1 + +/** If a table is created with the MYSQL option DATA DIRECTORY and +innodb-file-per-table, an older engine will not be able to find that table. +This flag prevents older engines from attempting to open the table and +allows InnoDB to update_create_info() accordingly. 
*/ +#define DICT_TF_WIDTH_DATA_DIR 1 + +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +The NO_ROLLBACK flag (3=yes; the values 1,2 used stand for +ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3) +*/ +#define DICT_TF_WIDTH_NO_ROLLBACK 2 + +/** Width of all the currently known table flags */ +#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + + DICT_TF_WIDTH_ZIP_SSIZE \ + + DICT_TF_WIDTH_ATOMIC_BLOBS \ + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_NO_ROLLBACK) + +/** Zero relative shift position of the COMPACT field */ +#define DICT_TF_POS_COMPACT 0 +/** Zero relative shift position of the ZIP_SSIZE field */ +#define DICT_TF_POS_ZIP_SSIZE (DICT_TF_POS_COMPACT \ + + DICT_TF_WIDTH_COMPACT) +/** Zero relative shift position of the ATOMIC_BLOBS field */ +#define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \ + + DICT_TF_WIDTH_ZIP_SSIZE) +/** Zero relative shift position of the DATA_DIR field */ +#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the NO_ROLLBACK field */ +#define DICT_TF_POS_NO_ROLLBACK (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_NO_ROLLBACK \ + + DICT_TF_WIDTH_NO_ROLLBACK) + +/** Bit mask of the COMPACT field */ +#define DICT_TF_MASK_COMPACT \ + ((~(~0U << DICT_TF_WIDTH_COMPACT)) \ + << DICT_TF_POS_COMPACT) +/** Bit mask of the ZIP_SSIZE field */ +#define 
DICT_TF_MASK_ZIP_SSIZE \ + ((~(~0U << DICT_TF_WIDTH_ZIP_SSIZE)) \ + << DICT_TF_POS_ZIP_SSIZE) +/** Bit mask of the ATOMIC_BLOBS field */ +#define DICT_TF_MASK_ATOMIC_BLOBS \ + ((~(~0U << DICT_TF_WIDTH_ATOMIC_BLOBS)) \ + << DICT_TF_POS_ATOMIC_BLOBS) +/** Bit mask of the DATA_DIR field */ +#define DICT_TF_MASK_DATA_DIR \ + ((~(~0U << DICT_TF_WIDTH_DATA_DIR)) \ + << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the NO_ROLLBACK field */ +#define DICT_TF_MASK_NO_ROLLBACK \ + ((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \ + << DICT_TF_POS_NO_ROLLBACK) + +/** Return the value of the COMPACT field */ +#define DICT_TF_GET_COMPACT(flags) \ + ((flags & DICT_TF_MASK_COMPACT) \ + >> DICT_TF_POS_COMPACT) +/** Return the value of the ZIP_SSIZE field */ +#define DICT_TF_GET_ZIP_SSIZE(flags) \ + ((flags & DICT_TF_MASK_ZIP_SSIZE) \ + >> DICT_TF_POS_ZIP_SSIZE) +/** Return the value of the ATOMIC_BLOBS field */ +#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_BLOBS) \ + >> DICT_TF_POS_ATOMIC_BLOBS) +/** Return the value of the DATA_DIR field */ +#define DICT_TF_HAS_DATA_DIR(flags) \ + ((flags & DICT_TF_MASK_DATA_DIR) \ + >> DICT_TF_POS_DATA_DIR) +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) + +/* @} */ + +/** @brief Table Flags set number 2. 
+ +These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags +will be written as 0. The column may contain garbage for tables +created with old versions of InnoDB that only implemented +ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags +for unknown bits in order to protect backward incompatibility. */ +/* @{ */ +/** Total number of bits in table->flags2. */ +#define DICT_TF2_BITS 7 +#define DICT_TF2_UNUSED_BIT_MASK (~0U << DICT_TF2_BITS) +#define DICT_TF2_BIT_MASK ~DICT_TF2_UNUSED_BIT_MASK + +/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */ +#define DICT_TF2_TEMPORARY 1U + +/** The table has an internal defined DOC ID column */ +#define DICT_TF2_FTS_HAS_DOC_ID 2U + +/** The table has an FTS index */ +#define DICT_TF2_FTS 4U + +/** Need to add Doc ID column for FTS index build. +This is a transient bit for index build */ +#define DICT_TF2_FTS_ADD_DOC_ID 8U + +/** This bit is used during table creation to indicate that it will +use its own tablespace instead of the system tablespace. */ +#define DICT_TF2_USE_FILE_PER_TABLE 16U + +/** Set when we discard/detach the tablespace */ +#define DICT_TF2_DISCARDED 32U + +/** This bit is set if all aux table names (both common tables and +index tables) of a FTS table are in HEX format. */ +#define DICT_TF2_FTS_AUX_HEX_NAME 64U + +/* @} */ + +#define DICT_TF2_FLAG_SET(table, flag) \ + (table->flags2 |= (flag)) + +#define DICT_TF2_FLAG_IS_SET(table, flag) \ + (table->flags2 & (flag)) + +#define DICT_TF2_FLAG_UNSET(table, flag) \ + (table->flags2 &= ~(flag) & ((1U << DICT_TF2_BITS) - 1)) + +/** Tables could be chained together with Foreign key constraint. When +first load the parent table, we would load all of its descedents. +This could result in rescursive calls and out of stack error eventually. +DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads, +when exceeded, the child table will not be loaded. It will be loaded when +the foreign constraint check needs to be run. 
*/ +#define DICT_FK_MAX_RECURSIVE_LOAD 20 + +/** Similarly, when tables are chained together with foreign key constraints +with on cascading delete/update clause, delete from parent table could +result in recursive cascading calls. This defines the maximum number of +such cascading deletes/updates allowed. When exceeded, the delete from +parent table will fail, and user has to drop excessive foreign constraint +before proceeds. */ +#define FK_MAX_CASCADE_DEL 15 + +/** Create a table memory object. +@param name table name +@param space tablespace +@param n_cols total number of columns (both virtual and non-virtual) +@param n_v_cols number of virtual columns +@param flags table flags +@param flags2 table flags2 +@return own: table object */ +dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space, + ulint n_cols, ulint n_v_cols, ulint flags, + ulint flags2); +/****************************************************************/ /** + Free a table memory object. */ +void +dict_mem_table_free( +/*================*/ + dict_table_t* table); /*!< in: table */ +/**********************************************************************//** +Adds a column definition to a table. */ +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /*!< in: table */ + mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */ + const char* name, /*!< in: column name, or NULL */ + ulint mtype, /*!< in: main datatype */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision */ + MY_ATTRIBUTE((nonnull(1))); +/** Adds a virtual column definition to a table. +@param[in,out] table table +@param[in] heap temporary memory heap, or NULL. It is + used to store name when we have not finished + adding all columns. 
When all columns are + added, the whole name will copy to memory from + table->heap +@param[in] name column name +@param[in] mtype main datatype +@param[in] prtype precise type +@param[in] len length +@param[in] pos position in a table +@param[in] num_base number of base columns +@return the virtual column definition */ +dict_v_col_t* +dict_mem_table_add_v_col( + dict_table_t* table, + mem_heap_t* heap, + const char* name, + ulint mtype, + ulint prtype, + ulint len, + ulint pos, + ulint num_base); + +/** Adds a stored column definition to a table. +@param[in] table table +@param[in] num_base number of base columns. */ +void +dict_mem_table_add_s_col( + dict_table_t* table, + ulint num_base); + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ulint nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to, /*!< in: new column name */ + bool is_virtual); + /*!< in: if this is a virtual column */ +/**********************************************************************//** +This function populates a dict_col_t memory structure with +supplied information. */ +void +dict_mem_fill_column_struct( +/*========================*/ + dict_col_t* column, /*!< out: column struct to be + filled */ + ulint col_pos, /*!< in: column position */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint col_len); /*!< in: column length */ +/**********************************************************************//** +This function poplulates a dict_index_t index memory structure with +supplied information. 
*/ +UNIV_INLINE +void +dict_mem_fill_index_struct( +/*=======================*/ + dict_index_t* index, /*!< out: index to be filled */ + mem_heap_t* heap, /*!< in: memory heap */ + const char* index_name, /*!< in: index name */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /*!< in: number of fields */ +/**********************************************************************//** +Creates an index memory object. +@return own: index object */ +dict_index_t* +dict_mem_index_create( +/*==================*/ + dict_table_t* table, /*!< in: table */ + const char* index_name, /*!< in: index name */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /*!< in: number of fields */ +/**********************************************************************//** +Adds a field definition to an index. NOTE: does not take a copy +of the column name if the field is a column. The memory occupied +by the column name may be released only after publishing the index. */ +void +dict_mem_index_add_field( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + const char* name, /*!< in: column name */ + ulint prefix_len); /*!< in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ +/**********************************************************************//** +Frees an index memory object. */ +void +dict_mem_index_free( +/*================*/ + dict_index_t* index); /*!< in: index */ +/**********************************************************************//** +Creates and initializes a foreign constraint memory object. +@return own: foreign constraint struct */ +dict_foreign_t* +dict_mem_foreign_create(void); +/*=========================*/ + +/**********************************************************************//** +Sets the foreign_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup +will point to foreign_table_name. 
If 2, then another string is +allocated from the heap and set to lower case. */ +void +dict_mem_foreign_table_name_lookup_set( +/*===================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc); /*!< in: is an alloc needed */ + +/**********************************************************************//** +Sets the referenced_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup +will point to referenced_table_name. If 2, then another string is +allocated from the heap and set to lower case. */ +void +dict_mem_referenced_table_name_lookup_set( +/*======================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc); /*!< in: is an alloc needed */ + +/** Fills the dependent virtual columns in a set. +Reason for being dependent are +1) FK can be present on base column of virtual columns +2) FK can be present on column which is a part of virtual index +@param[in,out] foreign foreign key information. */ +void +dict_mem_foreign_fill_vcol_set( + dict_foreign_t* foreign); + +/** Fill virtual columns set in each fk constraint present in the table. +@param[in,out] table innodb table object. */ +void +dict_mem_table_fill_foreign_vcol_set( + dict_table_t* table); + +/** Free the vcol_set from all foreign key constraint on the table. +@param[in,out] table innodb table object. */ +void +dict_mem_table_free_foreign_vcol_set( + dict_table_t* table); + +/** Create a temporary tablename like "#sql-ibNNN". 
+@param[in] heap A memory heap +@param[in] dbtab Table name in the form database/table name +@param[in] id Table id +@return A unique temporary tablename suitable for InnoDB use */ +char* +dict_mem_create_temporary_tablename( + mem_heap_t* heap, + const char* dbtab, + table_id_t id); + +/** SQL identifier name wrapper for pretty-printing */ +class id_name_t +{ +public: + /** Default constructor */ + id_name_t() + : m_name() + {} + /** Constructor + @param[in] name identifier to assign */ + explicit id_name_t( + const char* name) + : m_name(name) + {} + + /** Assignment operator + @param[in] name identifier to assign */ + id_name_t& operator=( + const char* name) + { + m_name = name; + return(*this); + } + + /** Implicit type conversion + @return the name */ + operator const char*() const + { + return(m_name); + } + + /** Explicit type conversion + @return the name */ + const char* operator()() const + { + return(m_name); + } + +private: + /** The name in internal representation */ + const char* m_name; +}; + +/** Data structure for a column in a table */ +struct dict_col_t{ + /*----------------------*/ + /** The following are copied from dtype_t, + so that all bit-fields can be packed tightly. 
*/ + /* @{ */ + unsigned prtype:32; /*!< precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + unsigned mtype:8; /*!< main data type */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /*!< length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + + unsigned mbminlen:3; /*!< minimum length of a + character, in bytes */ + unsigned mbmaxlen:3; /*!< maximum length of a + character, in bytes */ + /*----------------------*/ + /* End of definitions copied from dtype_t */ + /* @} */ + + unsigned ind:10; /*!< table column position + (starting from 0) */ + unsigned ord_part:1; /*!< nonzero if this column + appears in the ordering fields + of an index */ + unsigned max_prefix:12; /*!< maximum index prefix length on + this column. Our current max limit is + 3072 (REC_VERSION_56_MAX_INDEX_COL_LEN) + bytes. */ +private: + /** Special value of ind for a dropped column */ + static const unsigned DROPPED = 1023; +public: + + /** Detach a virtual column from an index. + @param index being-freed index */ + inline void detach(const dict_index_t &index); + + /** Data for instantly added columns */ + struct def_t + { + /** original default value of instantly added column */ + const void *data; + /** len of data, or UNIV_SQL_DEFAULT if unavailable */ + ulint len; + } def_val; + + /** Retrieve the column name. 
+ @param table the table of this column */ + const char *name(const dict_table_t &table) const; + + /** @return whether this is a virtual column */ + bool is_virtual() const { return prtype & DATA_VIRTUAL; } + /** @return whether NULL is an allowed value for this column */ + bool is_nullable() const { return !(prtype & DATA_NOT_NULL); } + + /** @return whether table of this system field is TRX_ID-based */ + bool vers_native() const + { + ut_ad(vers_sys_start() || vers_sys_end()); + ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY); + return mtype == DATA_INT; + } + /** @return whether this user column (not row_start, row_end) + has System Versioning property */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system version start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system version end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } + + /** @return whether this is an instantly-added column */ + bool is_added() const + { + DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data); + return def_val.len != UNIV_SQL_DEFAULT; + } + /** Flag the column instantly dropped */ + void set_dropped() { ind = DROPPED; } + /** Flag the column instantly dropped. + @param not_null whether the column was NOT NULL + @param len2 whether the length exceeds 255 bytes + @param fixed_len the fixed length in bytes, or 0 */ + void set_dropped(bool not_null, bool len2, unsigned fixed) + { + DBUG_ASSERT(!len2 || !fixed); + prtype= not_null ? DATA_NOT_NULL | DATA_BINARY_TYPE : DATA_BINARY_TYPE; + if (fixed) + { + mtype= DATA_FIXBINARY; + len= static_cast<uint16_t>(fixed); + } + else + { + mtype= DATA_BINARY; + len= len2 ? 
65535 : 255; + } + mbminlen= mbmaxlen= 0; + ind= DROPPED; + ord_part= 0; + max_prefix= 0; + } + /** @return whether the column was instantly dropped */ + bool is_dropped() const { return ind == DROPPED; } + /** @return whether the column was instantly dropped + @param index the clustered index */ + inline bool is_dropped(const dict_index_t &index) const; + + /** Get the default value of an instantly-added column. + @param[out] len value length (in bytes), or UNIV_SQL_NULL + @return default value + @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */ + const byte *instant_value(ulint *len) const + { + DBUG_ASSERT(is_added()); + *len= def_val.len; + return static_cast<const byte*>(def_val.data); + } + + /** Remove the 'instant ADD' status of the column */ + void clear_instant() + { + def_val.len= UNIV_SQL_DEFAULT; + def_val.data= NULL; + } + + /** @return whether two columns have compatible data type encoding */ + bool same_type(const dict_col_t &other) const + { + if (mtype != other.mtype) + { + /* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR + will be used instead of DATA_MYSQL and DATA_VARMYSQL. + As long as mtype,prtype are being written to InnoDB + data dictionary tables, we cannot simplify this. 
*/ + switch (mtype) { + default: + return false; + case DATA_VARCHAR: + if (other.mtype != DATA_VARMYSQL) + return false; + goto check_encoding; + case DATA_VARMYSQL: + if (other.mtype != DATA_VARCHAR) + return false; + goto check_encoding; + case DATA_CHAR: + if (other.mtype != DATA_MYSQL) + return false; + goto check_encoding; + case DATA_MYSQL: + if (other.mtype != DATA_CHAR) + return false; + goto check_encoding; + } + } + else if (dtype_is_string_type(mtype)) + { + check_encoding: + const uint16_t cset= dtype_get_charset_coll(prtype); + const uint16_t ocset= dtype_get_charset_coll(other.prtype); + return cset == ocset || dict_col_t::same_encoding(cset, ocset); + } + + return true; + } + + /** @return whether two collations codes have the same character encoding */ + static bool same_encoding(uint16_t a, uint16_t b); + + /** Determine if the columns have the same format + except for is_nullable() and is_versioned(). + @param other column to compare to + @return whether the columns have the same format */ + bool same_format(const dict_col_t &other) const + { + return same_type(other) && len >= other.len && + mbminlen == other.mbminlen && mbmaxlen >= other.mbmaxlen && + !((prtype ^ other.prtype) & ~(DATA_NOT_NULL | DATA_VERSIONED | + CHAR_COLL_MASK << 16 | + DATA_LONG_TRUE_VARCHAR)); + } + + /** @return whether the column values are comparable by memcmp() */ + bool is_binary() const { return prtype & DATA_BINARY_TYPE; } +}; + +/** Index information put in a list of virtual column structure. Index +id and virtual column position in the index will be logged. +There can be multiple entries for a given index, with a different position. 
*/ +struct dict_v_idx_t { + /** active index on the column */ + dict_index_t* index; + + /** position in this index */ + ulint nth_field; + + dict_v_idx_t(dict_index_t* index, ulint nth_field) + : index(index), nth_field(nth_field) {} +}; + +/** Data structure for a virtual column in a table */ +struct dict_v_col_t{ + /** column structure */ + dict_col_t m_col; + + /** array of base column ptr */ + dict_col_t** base_col; + + /** number of base column */ + unsigned num_base:10; + + /** column pos in table */ + unsigned v_pos:10; + + /** Virtual index list, and column position in the index */ + std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> > + v_indexes; + + /** Detach the column from an index. + @param index index to be detached from */ + void detach(const dict_index_t &index) + { + if (v_indexes.empty()) return; + auto i= v_indexes.before_begin(); + do { + auto prev = i++; + if (i == v_indexes.end()) + { + return; + } + if (i->index == &index) + { + v_indexes.erase_after(prev); + return; + } + } + while (i != v_indexes.end()); + } +}; + +/** Data structure for newly added virtual column in a index. +It is used only during rollback_inplace_alter_table() of +addition of index depending on newly added virtual columns +and uses index heap. Should be freed when index is being +removed from cache. 
*/ +struct dict_add_v_col_info +{ + ulint n_v_col; + dict_v_col_t *v_col; + + /** Add the newly added virtual column while rollbacking + the index which contains new virtual columns + @param col virtual column to be duplicated + @param offset offset where to duplicate virtual column */ + dict_v_col_t* add_drop_v_col(mem_heap_t *heap, dict_v_col_t *col, + ulint offset) + { + ut_ad(n_v_col); + ut_ad(offset < n_v_col); + if (!v_col) + v_col= static_cast<dict_v_col_t*> + (mem_heap_alloc(heap, n_v_col * sizeof *v_col)); + new (&v_col[offset]) dict_v_col_t(); + v_col[offset].m_col= col->m_col; + v_col[offset].v_pos= col->v_pos; + return &v_col[offset]; + } +}; + +/** Data structure for newly added virtual column in a table */ +struct dict_add_v_col_t{ + /** number of new virtual column */ + ulint n_v_col; + + /** column structures */ + const dict_v_col_t* v_col; + + /** new col names */ + const char** v_col_name; +}; + +/** Data structure for a stored column in a table. */ +struct dict_s_col_t { + /** Stored column ptr */ + dict_col_t* m_col; + /** array of base col ptr */ + dict_col_t** base_col; + /** number of base columns */ + ulint num_base; + /** column pos in table */ + ulint s_pos; +}; + +/** list to put stored column for create_table_info_t */ +typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> > +dict_s_col_list; + +/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and +is the maximum indexed column length (or indexed prefix length) in +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format, +any fixed-length field that is longer than this will be encoded as +a variable-length field. + +It is set to 3*256, so that one can create a column prefix index on +256 characters of a TEXT or VARCHAR column also in the UTF-8 +charset. In that charset, a character may take at most 3 bytes. This +constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! 
*/ +#define DICT_ANTELOPE_MAX_INDEX_COL_LEN REC_ANTELOPE_MAX_INDEX_COL_LEN + +/** Find out maximum indexed column length by its table format. +For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum +field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For +ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could +be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */ +#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \ + (dict_table_has_atomic_blobs(table) \ + ? REC_VERSION_56_MAX_INDEX_COL_LEN \ + : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) + +#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \ + (DICT_TF_HAS_ATOMIC_BLOBS(flags) \ + ? REC_VERSION_56_MAX_INDEX_COL_LEN \ + : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) + +/** Defines the maximum fixed length column size */ +#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN + +#ifdef WITH_WSREP +#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500 +#endif /* WITH_WSREP */ + +/** Data structure for a field in an index */ +struct dict_field_t{ + dict_col_t* col; /*!< pointer to the table column */ + id_name_t name; /*!< name of the column */ + unsigned prefix_len:12; /*!< 0 or the length of the column + prefix in bytes in a MySQL index of + type, e.g., INDEX (textcol(25)); + must be smaller than + DICT_MAX_FIELD_LEN_BY_FORMAT; + NOTE that in the UTF-8 charset, MySQL + sets this to (mbmaxlen * the prefix len) + in UTF-8 chars */ + unsigned fixed_len:10; /*!< 0 or the fixed length of the + column if smaller than + DICT_ANTELOPE_MAX_INDEX_COL_LEN */ + + /** Zero-initialize all fields */ + dict_field_t() : col(NULL), name(NULL), prefix_len(0), fixed_len(0) {} + + /** Check whether two index fields are equivalent. 
+ @param[in] old the other index field + @return whether the index fields are equivalent */ + bool same(const dict_field_t& other) const + { + return(prefix_len == other.prefix_len + && fixed_len == other.fixed_len); + } +}; + +/**********************************************************************//** +PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID +COMPRESSION FAILURES +(Note: this is relevant only for compressed indexes) +GOAL: Avoid compression failures by maintaining information about the +compressibility of data. If data is not very compressible then leave +some extra space 'padding' in the uncompressed page making it more +likely that compression of less than fully packed uncompressed page will +succeed. + +This padding heuristic works by increasing the pad linearly until the +desired failure rate is reached. A "round" is a fixed number of +compression operations. +After each round, the compression failure rate for that round is +computed. If the failure rate is too high, then padding is incremented +by a fixed value, otherwise it's left intact. +If the compression failure is lower than the desired rate for a fixed +number of consecutive rounds, then the padding is decreased by a fixed +value. This is done to prevent overshooting the padding value, +and to accommodate the possible change in data compressibility. */ + +/** Number of zip ops in one round. */ +#define ZIP_PAD_ROUND_LEN (128) + +/** Number of successful rounds after which the padding is decreased */ +#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5) + +/** Amount by which padding is increased. 
*/ +#define ZIP_PAD_INCR (128) + +/** Percentage of compression failures that are allowed in a single +round */ +extern ulong zip_failure_threshold_pct; + +/** Maximum percentage of a page that can be allowed as a pad to avoid +compression failures */ +extern ulong zip_pad_max; + +/** Data structure to hold information about about how much space in +an uncompressed page should be left as padding to avoid compression +failures. This estimate is based on a self-adapting heuristic. */ +struct zip_pad_info_t { + /** Dummy assignment operator for dict_index_t::clone() */ + zip_pad_info_t &operator=(const zip_pad_info_t&) { return *this; } + std::mutex mutex; /*!< mutex protecting the info */ + Atomic_relaxed<ulint> + pad; /*!< number of bytes used as pad */ + ulint success;/*!< successful compression ops during + current round */ + ulint failure;/*!< failed compression ops during + current round */ + ulint n_rounds;/*!< number of currently successful + rounds */ +}; + +/** Number of samples of data size kept when page compression fails for +a certain index.*/ +#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10 + +/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default +system clustered index when there is no primary key. */ +const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX"; + +/** Data structure for an index. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_index_create(). 
*/ +struct dict_index_t { + /** Maximum number of fields */ + static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1; + + index_id_t id; /*!< id of the index */ + mem_heap_t* heap; /*!< memory heap */ + id_name_t name; /*!< index name */ + dict_table_t* table; /*!< back pointer to table */ + /** root page number, or FIL_NULL if the index has been detached + from storage (DISCARD TABLESPACE or similar), + or 1 if the index is in table->freed_indexes */ + unsigned page:32; + unsigned merge_threshold:6; + /*!< In the pessimistic delete, if the page + data size drops below this limit in percent, + merging it to a neighbor is tried */ +# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50 + unsigned type:DICT_IT_BITS; + /*!< index type (DICT_CLUSTERED, DICT_UNIQUE, + DICT_IBUF, DICT_CORRUPT) */ +#define MAX_KEY_LENGTH_BITS 12 + unsigned trx_id_offset:MAX_KEY_LENGTH_BITS; + /*!< position of the trx id column + in a clustered index record, if the fields + before it are known to be of a fixed size, + 0 otherwise */ +#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH +# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH +#endif + unsigned n_user_defined_cols:10; + /*!< number of columns the user defined to + be in the index: in the internal + representation we add more columns */ + unsigned nulls_equal:1; + /*!< if true, SQL NULL == SQL NULL */ +#ifdef BTR_CUR_HASH_ADAPT +#ifdef MYSQL_INDEX_DISABLE_AHI + unsigned disable_ahi:1; + /*!< whether to disable the + adaptive hash index. + Maybe this could be disabled for + temporary tables? 
*/ +#endif +#endif /* BTR_CUR_HASH_ADAPT */ + unsigned n_uniq:10;/*!< number of fields from the beginning + which are enough to determine an index + entry uniquely */ + unsigned n_def:10;/*!< number of fields defined so far */ + unsigned n_fields:10;/*!< number of fields in the index */ + unsigned n_nullable:10;/*!< number of nullable fields */ + unsigned n_core_fields:10;/*!< number of fields in the index + (before the first time of instant add columns) */ + /** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer + records; usually equal to UT_BITS_IN_BYTES(n_nullable), but + can be less in clustered indexes with instant ADD COLUMN */ + unsigned n_core_null_bytes:8; + /** magic value signalling that n_core_null_bytes was not + initialized yet */ + static const unsigned NO_CORE_NULL_BYTES = 0xff; + /** The clustered index ID of the hard-coded SYS_INDEXES table. */ + static const unsigned DICT_INDEXES_ID = 3; + unsigned cached:1;/*!< TRUE if the index object is in the + dictionary cache */ + unsigned to_be_dropped:1; + /*!< TRUE if the index is to be dropped; + protected by dict_sys.latch */ + unsigned online_status:2; + /*!< enum online_index_status. + Transitions from ONLINE_INDEX_COMPLETE (to + ONLINE_INDEX_CREATION) are protected + by dict_sys.latch and + dict_sys.mutex. Other changes are + protected by index->lock. */ + unsigned uncommitted:1; + /*!< a flag that is set for secondary indexes + that have not been committed to the + data dictionary yet */ + +#ifdef UNIV_DEBUG + /** whether this is a dummy index object */ + bool is_dummy; + /** whether btr_cur_instant_init() is in progress */ + bool in_instant_init; + uint32_t magic_n;/*!< magic number */ +/** Value of dict_index_t::magic_n */ +# define DICT_INDEX_MAGIC_N 76789786 +#endif + dict_field_t* fields; /*!< array of field descriptions */ + st_mysql_ftparser* + parser; /*!< fulltext parser plugin */ + + /** It just indicates whether newly added virtual column + during alter. 
It stores column in case of alter failure. + It should use heap from dict_index_t. It should be freed + while removing the index from table. */ + dict_add_v_col_info* new_vcol_info; + UT_LIST_NODE_T(dict_index_t) + indexes;/*!< list of indexes of the table */ +#ifdef BTR_CUR_ADAPT + btr_search_t* search_info; + /*!< info used in optimistic searches */ +#endif /* BTR_CUR_ADAPT */ + row_log_t* online_log; + /*!< the log of modifications + during online index creation; + valid when online_status is + ONLINE_INDEX_CREATION */ + /*----------------------*/ + /** Statistics for query optimization */ + /* @{ */ + ib_uint64_t* stat_n_diff_key_vals; + /*!< approximate number of different + key values for this index, for each + n-column prefix where 1 <= n <= + dict_get_n_unique(index) (the array is + indexed from 0 to n_uniq-1); we + periodically calculate new + estimates */ + ib_uint64_t* stat_n_sample_sizes; + /*!< number of pages that were sampled + to calculate each of stat_n_diff_key_vals[], + e.g. stat_n_sample_sizes[3] pages were sampled + to get the number stat_n_diff_key_vals[3]. */ + ib_uint64_t* stat_n_non_null_key_vals; + /* approximate number of non-null key values + for this index, for each column where + 1 <= n <= dict_get_n_unique(index) (the array + is indexed from 0 to n_uniq-1); This + is used when innodb_stats_method is + "nulls_ignored". */ + ulint stat_index_size; + /*!< approximate index size in + database pages */ + ulint stat_n_leaf_pages; + /*!< approximate number of leaf pages in the + index tree */ + bool stats_error_printed; + /*!< has persistent statistics error printed + for this index ? */ + /* @} */ + /** Statistics for defragmentation, these numbers are estimations and + could be very inaccurate at certain times, e.g. right after restart, + during defragmentation, etc. */ + /* @{ */ + ulint stat_defrag_modified_counter; + ulint stat_defrag_n_pages_freed; + /* number of pages freed by defragmentation. 
*/ + ulint stat_defrag_n_page_split; + /* number of page splits since last full index + defragmentation. */ + ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE]; + /* data size when compression failure happened + the most recent 10 times. */ + ulint stat_defrag_sample_next_slot; + /* in which slot the next sample should be + saved. */ + /* @} */ +private: + /** R-tree split sequence number */ + Atomic_relaxed<node_seq_t> rtr_ssn; +public: + void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; } + node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; } + node_seq_t ssn() const { return rtr_ssn; } + + rtr_info_track_t* + rtr_track;/*!< tracking all R-Tree search cursors */ + trx_id_t trx_id; /*!< id of the transaction that created this + index, or 0 if the index existed + when InnoDB was started up */ + zip_pad_info_t zip_pad;/*!< Information about state of + compression failures and successes */ + mutable rw_lock_t lock; /*!< read-write lock protecting the + upper levels of the index tree */ + + /** Determine if the index has been committed to the + data dictionary. + @return whether the index definition has been committed */ + bool is_committed() const + { + ut_ad(!uncommitted || !(type & DICT_CLUSTERED)); + return(UNIV_LIKELY(!uncommitted)); + } + + /** Flag an index committed or uncommitted. + @param[in] committed whether the index is committed */ + void set_committed(bool committed) + { + ut_ad(!to_be_dropped); + ut_ad(committed || !(type & DICT_CLUSTERED)); + uncommitted = !committed; + } + + /** Notify that the index pages are going to be modified. 
+ @param[in,out] mtr mini-transaction */ + inline void set_modified(mtr_t& mtr) const; + + /** @return whether this index is readable + @retval true normally + @retval false if this is a single-table tablespace + and the .ibd file is missing, or a + page cannot be read or decrypted */ + inline bool is_readable() const; + + /** @return whether instant ALTER TABLE is in effect */ + inline bool is_instant() const; + + /** @return whether the index is the primary key index + (not the clustered index of the change buffer) */ + bool is_primary() const + { + return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF)); + } + + /** @return whether this is a generated clustered index */ + bool is_gen_clust() const { return type == DICT_CLUSTERED; } + + /** @return whether this is a clustered index */ + bool is_clust() const { return type & DICT_CLUSTERED; } + + /** @return whether this is a unique index */ + bool is_unique() const { return type & DICT_UNIQUE; } + + /** @return whether this is a spatial index */ + bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); } + + /** @return whether this is the change buffer */ + bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); } + + /** @return whether the index includes virtual columns */ + bool has_virtual() const { return type & DICT_VIRTUAL; } + + /** @return the position of DB_TRX_ID */ + uint16_t db_trx_id() const { + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(n_uniq); + DBUG_ASSERT(n_uniq <= MAX_REF_PARTS); + return n_uniq; + } + /** @return the position of DB_ROLL_PTR */ + uint16_t db_roll_ptr() const + { + return static_cast<uint16_t>(db_trx_id() + 1); + } + + /** @return the offset of the metadata BLOB field, + or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */ + uint16_t first_user_field() const + { + return static_cast<uint16_t>(db_trx_id() + 2); + } + + /** @return whether the index is corrupted */ + inline bool is_corrupted() const; + + /** Detach the virtual columns 
from the index that is to be removed. */ + void detach_columns() + { + if (!has_virtual() || !cached) + return; + for (unsigned i= 0; i < n_fields; i++) + { + dict_col_t* col= fields[i].col; + if (!col || !col->is_virtual()) + continue; + col->detach(*this); + } + } + + /** Determine how many fields of a given prefix can be set NULL. + @param[in] n_prefix number of fields in the prefix + @return number of fields 0..n_prefix-1 that can be set NULL */ + unsigned get_n_nullable(ulint n_prefix) const + { + DBUG_ASSERT(n_prefix > 0); + DBUG_ASSERT(n_prefix <= n_fields); + unsigned n = n_nullable; + for (; n_prefix < n_fields; n_prefix++) { + const dict_col_t* col = fields[n_prefix].col; + DBUG_ASSERT(!col->is_virtual()); + n -= col->is_nullable(); + } + DBUG_ASSERT(n < n_def); + return n; + } + + /** Get the default value of an instantly-added clustered index field. + @param[in] n instantly added field position + @param[out] len value length (in bytes), or UNIV_SQL_NULL + @return default value + @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */ + const byte* instant_field_value(ulint n, ulint* len) const + { + DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID); + DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields); + DBUG_ASSERT(n < n_fields); + return fields[n].col->instant_value(len); + } + + /** Adjust index metadata for instant ADD/DROP/reorder COLUMN. + @param[in] clustered index definition after instant ALTER TABLE */ + inline void instant_add_field(const dict_index_t& instant); + /** Remove instant ADD COLUMN metadata. */ + inline void clear_instant_add(); + /** Remove instant ALTER TABLE metadata. */ + inline void clear_instant_alter(); + + /** Construct the metadata record for instant ALTER TABLE. 
+ @param[in] row dummy or default values for existing columns + @param[in,out] heap memory heap for allocations + @return metadata record */ + inline dtuple_t* + instant_metadata(const dtuple_t& row, mem_heap_t* heap) const; + + /** Check if record in clustered index is historical row. + @param[in] rec clustered row + @param[in] offsets offsets + @return true if row is historical */ + bool + vers_history_row(const rec_t* rec, const rec_offs* offsets); + + /** Check if record in secondary index is historical row. + @param[in] rec record in a secondary index + @param[out] history_row true if row is historical + @return true on error */ + bool + vers_history_row(const rec_t* rec, bool &history_row); + + /** Assign the number of new column to be added as a part + of the index + @param n_vcol number of virtual columns to be added */ + void assign_new_v_col(ulint n_vcol) + { + new_vcol_info= static_cast<dict_add_v_col_info*> + (mem_heap_zalloc(heap, sizeof *new_vcol_info)); + new_vcol_info->n_v_col= n_vcol; + } + + /* @return whether index has new virtual column */ + bool has_new_v_col() const { return new_vcol_info; } + + /* @return number of newly added virtual column */ + ulint get_new_n_vcol() const + { return new_vcol_info ? new_vcol_info->n_v_col : 0; } + + /** Reconstruct the clustered index fields. */ + inline void reconstruct_fields(); + + /** Check if the index contains a column or a prefix of that column. + @param[in] n column number + @param[in] is_virtual whether it is a virtual col + @return whether the index contains the column or its prefix */ + bool contains_col_or_prefix(ulint n, bool is_virtual) const + MY_ATTRIBUTE((warn_unused_result)); + +#ifdef BTR_CUR_HASH_ADAPT + /** @return a clone of this */ + dict_index_t* clone() const; + /** Clone this index for lazy dropping of the adaptive hash index. 
+ @return this or a clone */ + dict_index_t* clone_if_needed(); + /** @return number of leaf pages pointed to by the adaptive hash index */ + inline ulint n_ahi_pages() const; + /** @return whether mark_freed() had been invoked */ + bool freed() const { return UNIV_UNLIKELY(page == 1); } + /** Note that the index is waiting for btr_search_lazy_free() */ + void set_freed() { ut_ad(!freed()); page= 1; } +#endif /* BTR_CUR_HASH_ADAPT */ + + /** @return whether it is forbidden to invoke clear_instant_add() */ + bool must_avoid_clear_instant_add() const + { + if (is_instant()) + for (auto i= this; (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; ) + if (i->to_be_dropped /* || i->online_log*/) + return true; + return false; + } + + /** This ad-hoc class is used by record_size_info only. */ + class record_size_info_t { + public: + record_size_info_t() + : max_leaf_size(0), shortest_size(0), too_big(false), + first_overrun_field_index(SIZE_T_MAX), overrun_size(0) + { + } + + /** Mark row potentially too big for page and set up first + overflow field index. 
*/ + void set_too_big(size_t field_index) + { + ut_ad(field_index != SIZE_T_MAX); + + too_big = true; + if (first_overrun_field_index > field_index) { + first_overrun_field_index = field_index; + overrun_size = shortest_size; + } + } + + /** @return overrun field index or SIZE_T_MAX if nothing + overflowed*/ + size_t get_first_overrun_field_index() const + { + ut_ad(row_is_too_big()); + ut_ad(first_overrun_field_index != SIZE_T_MAX); + return first_overrun_field_index; + } + + size_t get_overrun_size() const + { + ut_ad(row_is_too_big()); + return overrun_size; + } + + bool row_is_too_big() const { return too_big; } + + size_t max_leaf_size; /** Bigger row size this index can + produce */ + size_t shortest_size; /** shortest because it counts everything + as in overflow pages */ + + private: + bool too_big; /** This one is true when maximum row size this + index can produce is bigger than maximum row + size given page can hold. */ + size_t first_overrun_field_index; /** After adding this field + index row overflowed maximum + allowed size. Useful for + reporting back to user. */ + size_t overrun_size; /** Just overrun row size */ + }; + + /** Returns max possibly record size for that index, size of a shortest + everything in overflow) size of the longest possible row and index + of a field which made index records too big to fit on a page.*/ + inline record_size_info_t record_size_info() const; +}; + +/** Detach a virtual column from an index. 
+@param index being-freed index */ +inline void dict_col_t::detach(const dict_index_t &index) +{ + if (is_virtual()) + reinterpret_cast<dict_v_col_t*>(this)->detach(index); +} + +/** The status of online index creation */ +enum online_index_status { + /** the index is complete and ready for access */ + ONLINE_INDEX_COMPLETE = 0, + /** the index is being created, online + (allowing concurrent modifications) */ + ONLINE_INDEX_CREATION, + /** secondary index creation was aborted and the index + should be dropped as soon as index->table->n_ref_count reaches 0, + or online table rebuild was aborted and the clustered index + of the original table should soon be restored to + ONLINE_INDEX_COMPLETE */ + ONLINE_INDEX_ABORTED, + /** the online index creation was aborted, the index was + dropped from the data dictionary and the tablespace, and it + should be dropped from the data dictionary cache as soon as + index->table->n_ref_count reaches 0. */ + ONLINE_INDEX_ABORTED_DROPPED +}; + +/** Set to store the virtual columns which are affected by Foreign +key constraint. */ +typedef std::set<dict_v_col_t*, std::less<dict_v_col_t*>, + ut_allocator<dict_v_col_t*> > dict_vcol_set; + +/** Data structure for a foreign key constraint; an example: +FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be +initialized to 0, NULL or FALSE in dict_mem_foreign_create(). 
*/ +struct dict_foreign_t{ + mem_heap_t* heap; /*!< this object is allocated from + this memory heap */ + char* id; /*!< id of the constraint as a + null-terminated string */ + unsigned n_fields:10; /*!< number of indexes' first fields + for which the foreign key + constraint is defined: we allow the + indexes to contain more fields than + mentioned in the constraint, as long + as the first fields are as mentioned */ + unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE + or DICT_FOREIGN_ON_DELETE_SET_NULL */ + char* foreign_table_name;/*!< foreign table name */ + char* foreign_table_name_lookup; + /*!< foreign table name used for dict lookup */ + dict_table_t* foreign_table; /*!< table where the foreign key is */ + const char** foreign_col_names;/*!< names of the columns in the + foreign key */ + char* referenced_table_name;/*!< referenced table name */ + char* referenced_table_name_lookup; + /*!< referenced table name for dict lookup*/ + dict_table_t* referenced_table;/*!< table where the referenced key + is */ + const char** referenced_col_names;/*!< names of the referenced + columns in the referenced table */ + dict_index_t* foreign_index; /*!< foreign index; we require that + both tables contain explicitly defined + indexes for the constraint: InnoDB + does not generate new indexes + implicitly */ + dict_index_t* referenced_index;/*!< referenced index */ + + dict_vcol_set* v_cols; /*!< set of virtual columns affected + by foreign key constraint. */ + + /** Check whether the fulltext index gets affected by + foreign key constraint */ + bool affects_fulltext() const; +}; + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_t& foreign); + +struct dict_foreign_print { + + dict_foreign_print(std::ostream& out) + : m_out(out) + {} + + void operator()(const dict_foreign_t* foreign) { + m_out << *foreign; + } +private: + std::ostream& m_out; +}; + +/** Compare two dict_foreign_t objects using their ids. 
Used in the ordering +of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns +true if the first argument is considered to go before the second in the +strict weak ordering it defines, and false otherwise. */ +struct dict_foreign_compare { + + bool operator()( + const dict_foreign_t* lhs, + const dict_foreign_t* rhs) const + { + return strcmp(lhs->id, rhs->id) < 0; + } +}; + +/** A function object to find a foreign key with the given index as the +referenced index. Return the foreign key with matching criteria or NULL */ +struct dict_foreign_with_index { + + dict_foreign_with_index(const dict_index_t* index) + : m_index(index) + {} + + bool operator()(const dict_foreign_t* foreign) const + { + return(foreign->referenced_index == m_index); + } + + const dict_index_t* m_index; +}; + +#ifdef WITH_WSREP +/** A function object to find a foreign key with the given index as the +foreign index. Return the foreign key with matching criteria or NULL */ +struct dict_foreign_with_foreign_index { + + dict_foreign_with_foreign_index(const dict_index_t* index) + : m_index(index) + {} + + bool operator()(const dict_foreign_t* foreign) const + { + return(foreign->foreign_index == m_index); + } + + const dict_index_t* m_index; +}; +#endif + +/* A function object to check if the foreign constraint is between different +tables. Returns true if foreign key constraint is between different tables, +false otherwise. */ +struct dict_foreign_different_tables { + + bool operator()(const dict_foreign_t* foreign) const + { + return(foreign->foreign_table != foreign->referenced_table); + } +}; + +/** A function object to check if the foreign key constraint has the same +name as given. If the full name of the foreign key constraint doesn't match, +then, check if removing the database name from the foreign key constraint +matches. Return true if it matches, false otherwise. 
*/ +struct dict_foreign_matches_id { + + dict_foreign_matches_id(const char* id) + : m_id(id) + {} + + bool operator()(const dict_foreign_t* foreign) const + { + if (0 == innobase_strcasecmp(foreign->id, m_id)) { + return(true); + } + if (const char* pos = strchr(foreign->id, '/')) { + if (0 == innobase_strcasecmp(m_id, pos + 1)) { + return(true); + } + } + return(false); + } + + const char* m_id; +}; + +typedef std::set< + dict_foreign_t*, + dict_foreign_compare, + ut_allocator<dict_foreign_t*> > dict_foreign_set; + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_set& fk_set); + +/** Function object to check if a foreign key object is there +in the given foreign key set or not. It returns true if the +foreign key is not found, false otherwise */ +struct dict_foreign_not_exists { + dict_foreign_not_exists(const dict_foreign_set& obj_) + : m_foreigns(obj_) + {} + + /* Return true if the given foreign key is not found */ + bool operator()(dict_foreign_t* const & foreign) const { + return(m_foreigns.find(foreign) == m_foreigns.end()); + } +private: + const dict_foreign_set& m_foreigns; +}; + +/** Validate the search order in the foreign key set. +@param[in] fk_set the foreign key set to be validated +@return true if search order is fine in the set, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_foreign_set& fk_set); + +/** Validate the search order in the foreign key sets of the table +(foreign_set and referenced_set). +@param[in] table table whose foreign key sets are to be validated +@return true if foreign key sets are fine, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_table_t& table); + +/*********************************************************************//** +Frees a foreign key struct. 
*/ +inline +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign) /*!< in, own: foreign key struct */ +{ + /* foreign itself is allocated from foreign->heap, so v_cols + must be released before the heap is freed. */ + if (foreign->v_cols != NULL) { + UT_DELETE(foreign->v_cols); + } + + mem_heap_free(foreign->heap); +} + +/** The destructor will free all the foreign key constraints in the set +by calling dict_foreign_free() on each of the foreign key constraints. +This is used to free the allocated memory when a local set goes out +of scope. */ +struct dict_foreign_set_free { + + dict_foreign_set_free(const dict_foreign_set& foreign_set) + : m_foreign_set(foreign_set) + {} + + ~dict_foreign_set_free() + { + std::for_each(m_foreign_set.begin(), + m_foreign_set.end(), + dict_foreign_free); + } + + const dict_foreign_set& m_foreign_set; +}; + +/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that +a foreign key constraint is enforced, therefore RESTRICT just means no flag */ +/* @{ */ +#define DICT_FOREIGN_ON_DELETE_CASCADE 1U /*!< ON DELETE CASCADE */ +#define DICT_FOREIGN_ON_DELETE_SET_NULL 2U /*!< ON DELETE SET NULL */ +#define DICT_FOREIGN_ON_UPDATE_CASCADE 4U /*!< ON UPDATE CASCADE */ +#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8U /*!< ON UPDATE SET NULL */ +#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16U /*!< ON DELETE NO ACTION */ +#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32U /*!< ON UPDATE NO ACTION */ +/* @} */ + +/** Display an identifier. +@param[in,out] s output stream +@param[in] id_name SQL identifier (other than table name) +@return the output stream */ +std::ostream& +operator<<( + std::ostream& s, + const id_name_t& id_name); + +/** Display a table name. +@param[in,out] s output stream +@param[in] table_name table name +@return the output stream */ +std::ostream& +operator<<( + std::ostream& s, + const table_name_t& table_name); + +/** List of locks that different transactions have acquired on a table. This +list has a list node that is embedded in a nested union/structure. We have to +generate a specific template for it. 
*/ + +typedef ut_list_base<lock_t, ut_list_node<lock_t> lock_table_t::*> + table_lock_list_t; + +/** mysql template structure defined in row0mysql.cc */ +struct mysql_row_templ_t; + +/** Structure defines template related to virtual columns and +their base columns */ +struct dict_vcol_templ_t { + /** number of regular columns */ + ulint n_col; + + /** number of virtual columns */ + ulint n_v_col; + + /** array of templates for virtual col and their base columns */ + mysql_row_templ_t** vtempl; + + /** table's database name */ + std::string db_name; + + /** table name */ + std::string tb_name; + + /** MySQL record length */ + ulint rec_len; + + /** default column value if any */ + byte* default_rec; + + /** cached MySQL TABLE object */ + TABLE* mysql_table; + + /** when mysql_table was cached */ + uint64_t mysql_table_query_id; + + dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {} +}; + +/** Metadata on clustered index fields starting from first_user_field() */ +class field_map_element_t +{ + /** Number of bits for representing a column number */ + static constexpr uint16_t IND_BITS = 10; + + /** Set if the column of the field has been instantly dropped */ + static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5); + + /** Set if the column was dropped and originally declared NOT NULL */ + static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4); + + /** Column index (if !(data & DROPPED)): table->cols[data & IND], + or field length (if (data & DROPPED)): + (data & IND) = 0 if variable-length with max_len < 256 bytes; + (data & IND) = 1 if variable-length with max_len > 255 bytes; + (data & IND) = 1 + L otherwise, with L=fixed length of the column */ + static constexpr uint16_t IND = (1U << IND_BITS) - 1; + + /** Field metadata */ + uint16_t data; + + void clear_not_null() { data &= uint16_t(~NOT_NULL); } +public: + bool is_dropped() const { return data & DROPPED; } + void set_dropped() { data |= DROPPED; } + bool is_not_null() const { return data & 
NOT_NULL; } + void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; } + uint16_t ind() const { return data & IND; } + void set_ind(uint16_t i) + { + DBUG_ASSERT(i <= IND); + DBUG_ASSERT(!ind()); + data |= i; + } + field_map_element_t& operator= (uint16_t value) + { + data = value; + return *this; + } + operator uint16_t() { return data; } +}; + +static_assert(sizeof(field_map_element_t) == 2, + "Size mismatch for a persistent data item!"); + +/** Instantly dropped or reordered columns */ +struct dict_instant_t +{ + /** Number of dropped columns */ + unsigned n_dropped; + /** Dropped columns */ + dict_col_t* dropped; + /** Map of clustered index non-PK fields[i - first_user_field()] + to table columns */ + field_map_element_t* field_map; +}; + +/** These are used when MySQL FRM and InnoDB data dictionary are +in inconsistent state. */ +typedef enum { + DICT_FRM_CONSISTENT = 0, /*!< Consistent state */ + DICT_FRM_NO_PK = 1, /*!< MySQL has no primary key + but InnoDB dictionary has + non-generated one. */ + DICT_NO_PK_FRM_HAS = 2, /*!< MySQL has primary key but + InnoDB dictionary has not. */ + DICT_FRM_INCONSISTENT_KEYS = 3 /*!< Key count mismatch */ +} dict_frm_t; + +/** Data structure for a database table. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_table_create(). */ +struct dict_table_t { + + /** Get reference count. + @return current value of n_ref_count */ + inline uint32_t get_ref_count() const { return n_ref_count; } + + /** Acquire the table handle. */ + inline void acquire(); + + /** Release the table handle. 
+ @return whether the last handle was released */ + inline bool release(); + + /** @return whether the table does not support rollback + (all DICT_TF_MASK_NO_ROLLBACK bits are set in flags) */ + bool no_rollback() const + { + return !(~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK); + } + /** @return whether this is a temporary table */ + bool is_temporary() const + { + return flags2 & DICT_TF2_TEMPORARY; + } + + /** @return whether the table is not in ROW_FORMAT=REDUNDANT */ + bool not_redundant() const { return flags & DICT_TF_COMPACT; } + + /** @return whether this table is readable + @retval true normally + @retval false if this is a single-table tablespace + and the .ibd file is missing, or a + page cannot be read or decrypted */ + bool is_readable() const + { + ut_ad(file_unreadable || space); + return(UNIV_LIKELY(!file_unreadable)); + } + + /** @return whether the table is accessible: it is readable, + not flagged as corrupted, has a tablespace, and that tablespace + is not being stopped */ + bool is_accessible() const + { + return UNIV_LIKELY(is_readable() && !corrupted && space) + && !space->is_stopping(); + } + + /** Check if a table name contains the string "/#sql" + which denotes temporary or intermediate tables in MariaDB. */ + static bool is_temporary_name(const char* name) + { + return strstr(name, "/" TEMP_FILE_PREFIX) != NULL; + } + + /** @return whether instant ALTER TABLE is in effect */ + bool is_instant() const + { + return(UT_LIST_GET_FIRST(indexes)->is_instant()); + } + + /** @return whether the table supports instant ALTER TABLE */ + bool supports_instant() const + { + return(!(flags & DICT_TF_MASK_ZIP_SSIZE)); + } + + /** @return the number of instantly dropped columns */ + unsigned n_dropped() const { return instant ? instant->n_dropped : 0; } + + /** Look up an old column. 
+ @param[in] cols the old columns of the table + @param[in] col_map map from old table columns to altered ones + @param[in] n_cols number of old columns + @param[in] i the number of the new column + @return old column + @retval NULL if column i was added to the table */ + static const dict_col_t* find(const dict_col_t* cols, + const ulint* col_map, ulint n_cols, + ulint i) + { + for (ulint o = n_cols; o--; ) { + if (col_map[o] == i) { + return &cols[o]; + } + } + return NULL; + } + + /** Serialise metadata of dropped or reordered columns. + @param[in,out] heap memory heap for allocation + @param[out] field data field with the metadata */ + inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const; + + /** Reconstruct dropped or reordered columns. + @param[in] metadata data from serialise_columns() + @param[in] len length of the metadata, in bytes + @return whether parsing the metadata failed */ + bool deserialise_columns(const byte* metadata, ulint len); + + /** Set is_instant() before instant_column(). + @param[in] old previous table definition + @param[in] col_map map from old.cols[] + and old.v_cols[] to this + @param[out] first_alter_pos 0, or + 1 + first changed column position */ + inline void prepare_instant(const dict_table_t& old, + const ulint* col_map, + unsigned& first_alter_pos); + + /** Adjust table metadata for instant ADD/DROP/reorder COLUMN. + @param[in] table table on which prepare_instant() was invoked + @param[in] col_map mapping from cols[] and v_cols[] to table + @return whether the metadata record must be updated */ + inline bool instant_column(const dict_table_t& table, + const ulint* col_map); + + /** Roll back instant_column(). 
+ @param[in] old_n_cols original n_cols + @param[in] old_cols original cols + @param[in] old_col_names original col_names + @param[in] old_instant original instant structure + @param[in] old_fields original fields + @param[in] old_n_fields original number of fields + @param[in] old_n_core_fields original number of core fields + @param[in] old_n_v_cols original n_v_cols + @param[in] old_v_cols original v_cols + @param[in] old_v_col_names original v_col_names + @param[in] col_map column map */ + inline void rollback_instant( + unsigned old_n_cols, + dict_col_t* old_cols, + const char* old_col_names, + dict_instant_t* old_instant, + dict_field_t* old_fields, + unsigned old_n_fields, + unsigned old_n_core_fields, + unsigned old_n_v_cols, + dict_v_col_t* old_v_cols, + const char* old_v_col_names, + const ulint* col_map); + + /** Add the table definition to the data dictionary cache */ + void add_to_cache(); + + /** @return whether the table is versioned. + It is assumed that both vers_start and vers_end set to 0 + iff table is not versioned. In any other case, + these fields correspond to actual positions in cols[]. */ + bool versioned() const { return vers_start || vers_end; } + bool versioned_by_id() const + { + return versioned() && cols[vers_start].mtype == DATA_INT; + } + + void inc_fk_checks() + { +#ifdef UNIV_DEBUG + int32_t fk_checks= +#endif + n_foreign_key_checks_running++; + ut_ad(fk_checks >= 0); + } + void dec_fk_checks() + { +#ifdef UNIV_DEBUG + int32_t fk_checks= +#endif + n_foreign_key_checks_running--; + ut_ad(fk_checks > 0); + } + + /** For overflow fields returns potential max length stored inline */ + inline size_t get_overflow_field_local_len() const; + + /** Parse the table file name into table name and database name. 
+ @tparam dict_locked whether dict_sys.mutex is being held + @param[in,out] db_name database name buffer + @param[in,out] tbl_name table name buffer + @param[out] db_name_len database name length + @param[out] tbl_name_len table name length + @return whether the table name is visible to SQL */ + template<bool dict_locked= false> + bool parse_name(char (&db_name)[NAME_LEN + 1], + char (&tbl_name)[NAME_LEN + 1], + size_t *db_name_len, size_t *tbl_name_len) const; + +private: + /** Initialize instant->field_map. + @param[in] table table definition to copy from */ + inline void init_instant(const dict_table_t& table); +public: + /** Id of the table. */ + table_id_t id; + /** Hash chain node. */ + hash_node_t id_hash; + /** Table name. */ + table_name_t name; + /** Hash chain node. */ + hash_node_t name_hash; + + /** Memory heap */ + mem_heap_t* heap; + + /** NULL or the directory path specified by DATA DIRECTORY. */ + char* data_dir_path; + + /** The tablespace of the table */ + fil_space_t* space; + /** Tablespace ID */ + ulint space_id; + + /** Stores information about: + 1 row format (redundant or compact), + 2 compressed page size (zip shift size), + 3 whether using atomic blobs, + 4 whether the table has been created with the option DATA DIRECTORY. + Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(), + DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this + flag. */ + unsigned flags:DICT_TF_BITS; + + /** Stores information about: + 1 whether the table has been created using CREATE TEMPORARY TABLE, + 2 whether the table has an internally defined DOC ID column, + 3 whether the table has a FTS index, + 4 whether DOC ID column need to be added to the FTS index, + 5 whether the table is being created its own tablespace, + 6 whether the table has been DISCARDed, + 7 whether the aux FTS tables names are in hex. + Use DICT_TF2_FLAG_IS_SET() to parse this flag. 
*/ + unsigned flags2:DICT_TF2_BITS; + + /** TRUE if the table is an intermediate table during copy alter + operation or a partition/subpartition which is required for copying + data and skip the undo log for insertion of row in the table. + This variable will be set and unset during extra(), or during the + process of altering partitions */ + unsigned skip_alter_undo:1; + + /** Whether this is in a single-table tablespace and the .ibd + file is missing or page decryption failed and page is corrupted */ + unsigned file_unreadable:1; + + /** TRUE if the table object has been added to the dictionary cache. */ + unsigned cached:1; + + /** TRUE if the table is to be dropped, but not yet actually dropped + (could be in the background drop list). It is turned on at the beginning + of row_drop_table_for_mysql() and turned off just before we start to + update system tables for the drop. It is protected by dict_sys.latch. */ + unsigned to_be_dropped:1; + + /** Number of non-virtual columns defined so far. */ + unsigned n_def:10; + + /** Number of non-virtual columns. */ + unsigned n_cols:10; + + /** Number of total columns (includes virtual and non-virtual) */ + unsigned n_t_cols:10; + + /** Number of total columns defined so far. */ + unsigned n_t_def:10; + + /** Number of virtual columns defined so far. */ + unsigned n_v_def:10; + + /** Number of virtual columns. */ + unsigned n_v_cols:10; + + /** 1 + the position of autoinc counter field in clustered + index, or 0 if there is no persistent AUTO_INCREMENT column in + the table. */ + unsigned persistent_autoinc:10; + + /** TRUE if it's not an InnoDB system table or a table that has no FK + relationships. */ + unsigned can_be_evicted:1; + + /** TRUE if table is corrupted. */ + unsigned corrupted:1; + + /** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED + or ONLINE_INDEX_ABORTED_DROPPED. */ + unsigned drop_aborted:1; + + /** Array of column descriptions. 
*/ + dict_col_t* cols; + + /** Array of virtual column descriptions. */ + dict_v_col_t* v_cols; + + /** List of stored column descriptions. It is used only for foreign key + check during create table and copy alter operations. + During copy alter, s_cols list is filled during create table operation + and need to preserve till rename table operation. That is the + reason s_cols is a part of dict_table_t */ + dict_s_col_list* s_cols; + + /** Instantly dropped or reordered columns, or NULL if none */ + dict_instant_t* instant; + + /** Column names packed in a character string + "name1\0name2\0...nameN\0". Until the string contains n_cols, it will + be allocated from a temporary heap. The final string will be allocated + from table->heap. */ + const char* col_names; + + /** Virtual column names */ + const char* v_col_names; + unsigned vers_start:10; + /*!< System Versioning: row start col index */ + unsigned vers_end:10; + /*!< System Versioning: row end col index */ + bool is_system_db; + /*!< True if the table belongs to a system + database (mysql, information_schema or + performance_schema) */ + dict_frm_t dict_frm_mismatch; + /*!< !DICT_FRM_CONSISTENT==0 if data + dictionary information and + MySQL FRM information mismatch. */ + /** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */ + dict_index_t* fts_doc_id_index; + + /** List of indexes of the table. */ + UT_LIST_BASE_NODE_T(dict_index_t) indexes; +#ifdef BTR_CUR_HASH_ADAPT + /** List of detached indexes that are waiting to be freed along with + the last adaptive hash index entry */ + UT_LIST_BASE_NODE_T(dict_index_t) freed_indexes; +#endif /* BTR_CUR_HASH_ADAPT */ + + /** List of foreign key constraints in the table. These refer to + columns in other tables. */ + UT_LIST_BASE_NODE_T(dict_foreign_t) foreign_list; + + /** List of foreign key constraints which refer to this table. */ + UT_LIST_BASE_NODE_T(dict_foreign_t) referenced_list; + + /** Node of the LRU list of tables. 
*/ + UT_LIST_NODE_T(dict_table_t) table_LRU; + + /** Maximum recursive level we support when loading tables chained + together with FK constraints. If exceeds this level, we will stop + loading child table into memory along with its parent table. */ + unsigned fk_max_recusive_level:8; + + /** Count of how many foreign key check operations are currently being + performed on the table. We cannot drop the table while there are + foreign key checks running on it. */ + Atomic_counter<int32_t> n_foreign_key_checks_running; + + /** Transactions whose view low limit is greater than this number are + not allowed to store to the MySQL query cache or retrieve from it. + When a trx with undo logs commits, it sets this to the value of the + transaction id. */ + trx_id_t query_cache_inv_trx_id; + + /** Transaction id that last touched the table definition. Either when + loading the definition or CREATE TABLE, or ALTER TABLE (prepare, + commit, and rollback phases). */ + trx_id_t def_trx_id; + + /*!< set of foreign key constraints in the table; these refer to + columns in other tables */ + dict_foreign_set foreign_set; + + /*!< set of foreign key constraints which refer to this table */ + dict_foreign_set referenced_set; + + /** Statistics for query optimization. Mostly protected by + dict_sys.mutex. @{ */ + + /** TRUE if statistics have been calculated the first time after + database startup or table creation. */ + unsigned stat_initialized:1; + + /** Timestamp of last recalc of the stats. */ + time_t stats_last_recalc; + + /** The two bits below are set in the 'stat_persistent' member. They + have the following meaning: + 1. _ON=0, _OFF=0, no explicit persistent stats setting for this table, + the value of the global srv_stats_persistent is used to determine + whether the table has persistent stats enabled or not + 2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this + table, regardless of the value of the global srv_stats_persistent + 3. 
_ON=1, _OFF=0, persistent stats are explicitly enabled for this + table, regardless of the value of the global srv_stats_persistent + 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */ + #define DICT_STATS_PERSISTENT_ON (1 << 1) + #define DICT_STATS_PERSISTENT_OFF (1 << 2) + + /** Indicates whether the table uses persistent stats or not. See + DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */ + ib_uint32_t stat_persistent; + + /** The two bits below are set in the 'stats_auto_recalc' member. They + have the following meaning: + 1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the + value of the global srv_stats_persistent_auto_recalc is used to + determine whether the table has auto recalc enabled or not + 2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table, + regardless of the value of the global srv_stats_persistent_auto_recalc + 3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table, + regardless of the value of the global srv_stats_persistent_auto_recalc + 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */ + #define DICT_STATS_AUTO_RECALC_ON (1 << 1) + #define DICT_STATS_AUTO_RECALC_OFF (1 << 2) + + /** Indicates whether the table uses automatic recalc for persistent + stats or not. See DICT_STATS_AUTO_RECALC_ON and + DICT_STATS_AUTO_RECALC_OFF. */ + ib_uint32_t stats_auto_recalc; + + /** The number of pages to sample for this table during persistent + stats estimation. If this is 0, then the value of the global + srv_stats_persistent_sample_pages will be used instead. */ + ulint stats_sample_pages; + + /** Approximate number of rows in the table. We periodically calculate + new estimates. */ + ib_uint64_t stat_n_rows; + + /** Approximate clustered index size in database pages. */ + ulint stat_clustered_index_size; + + /** Approximate size of other indexes in database pages. */ + ulint stat_sum_of_other_index_sizes; + + /** How many rows are modified since last stats recalc. 
When a row is + inserted, updated, or deleted, we add 1 to this number; we calculate + new estimates for the table and the indexes if the table has changed + too much, see dict_stats_update_if_needed(). The counter is reset + to zero at statistics calculation. This counter is not protected by + any latch, because this is only used for heuristics. */ + ib_uint64_t stat_modified_counter; + + /** Background stats thread is not working on this table. */ + #define BG_STAT_NONE 0 + + /** Set in 'stats_bg_flag' when the background stats code is working + on this table. The DROP TABLE code waits for this to be cleared before + proceeding. */ + #define BG_STAT_IN_PROGRESS (1 << 0) + + /** Set in 'stats_bg_flag' when DROP TABLE starts waiting on + BG_STAT_IN_PROGRESS to be cleared. The background stats thread will + detect this and will eventually quit sooner. */ + #define BG_STAT_SHOULD_QUIT (1 << 1) + + /** The state of the background stats thread wrt this table. + See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT. + Writes are covered by dict_sys.mutex. Dirty reads are possible. */ + byte stats_bg_flag; + + bool stats_error_printed; + /*!< Has persistent stats error beein + already printed for this table ? */ + /* @} */ + + /** AUTOINC related members. @{ */ + + /* The actual collection of tables locked during AUTOINC read/write is + kept in trx_t. In order to quickly determine whether a transaction has + locked the AUTOINC lock we keep a pointer to the transaction here in + the 'autoinc_trx' member. This is to avoid acquiring the + lock_sys_t::mutex and scanning the vector in trx_t. + When an AUTOINC lock has to wait, the corresponding lock instance is + created on the trx lock heap rather than use the pre-allocated instance + in autoinc_lock below. */ + + /** A buffer for an AUTOINC lock for this table. 
We allocate the + memory here so that individual transactions can get it and release it + without a need to allocate space from the lock heap of the trx: + otherwise the lock heap would grow rapidly if we do a large insert + from a select. */ + lock_t* autoinc_lock; + + /** Mutex protecting the autoincrement counter. */ + std::mutex autoinc_mutex; + + /** Autoinc counter value to give to the next inserted row. */ + ib_uint64_t autoinc; + + /** This counter is used to track the number of granted and pending + autoinc locks on this table. This value is set after acquiring the + lock_sys_t::mutex but we peek the contents to determine whether other + transactions have acquired the AUTOINC lock or not. Of course only one + transaction can be granted the lock but there can be multiple + waiters. */ + ulong n_waiting_or_granted_auto_inc_locks; + + /** The transaction that currently holds the the AUTOINC lock on this + table. Protected by lock_sys.mutex. */ + const trx_t* autoinc_trx; + + /* @} */ + + /** FTS specific state variables. */ + fts_t* fts; + + /** Quiescing states, protected by the dict_index_t::lock. ie. we can + only change the state if we acquire all the latches (dict_index_t::lock) + in X mode of this table's indexes. */ + ib_quiesce_t quiesce; + + /** Count of the number of record locks on this table. We use this to + determine whether we can evict the table from the dictionary cache. + It is protected by lock_sys.mutex. */ + ulint n_rec_locks; + +private: + /** Count of how many handles are opened to this table. Dropping of the + table is NOT allowed until this count gets to zero. MySQL does NOT + itself check the number of open handles at DROP. */ + Atomic_counter<uint32_t> n_ref_count; + +public: + /** List of locks on the table. Protected by lock_sys.mutex. */ + table_lock_list_t locks; + + /** Timestamp of the last modification of this table. */ + time_t update_time; + +#ifdef UNIV_DEBUG + /** Value of 'magic_n'. 
*/ + #define DICT_TABLE_MAGIC_N 76333786 + + /** Magic number. */ + ulint magic_n; +#endif /* UNIV_DEBUG */ + /** mysql_row_templ_t for base columns used to compute the virtual + columns */ + dict_vcol_templ_t* vc_templ; + + /** @return whether the locks list contains a table lock that + belongs to a transaction other than the given one */ + bool has_lock_other_than(const trx_t *trx) const + { + for (lock_t *lock= UT_LIST_GET_FIRST(locks); lock; + lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) + if (lock->trx != trx) + return true; + return false; + } + + /** Check whether the table name is same as mysql/innodb_stats_table + or mysql/innodb_index_stats. + @return true if the table name is same as stats table */ + bool is_stats_table() const; +}; + +/** Register the tablespace of the index's table as the named space +of the mini-transaction. */ +inline void dict_index_t::set_modified(mtr_t& mtr) const +{ + mtr.set_named_space(table->space); +} + +inline bool table_name_t::is_temporary() const +{ + return dict_table_t::is_temporary_name(m_name); +} + +inline bool dict_index_t::is_readable() const { return table->is_readable(); } + +/** @return whether n_core_fields differs from n_fields, or this is the +primary index of a table that has table->instant set */ +inline bool dict_index_t::is_instant() const +{ + ut_ad(n_core_fields > 0); + ut_ad(n_core_fields <= n_fields || table->n_dropped()); + ut_ad(n_core_fields == n_fields + || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED); + ut_ad(n_core_fields == n_fields || table->supports_instant()); + ut_ad(n_core_fields == n_fields || !table->is_temporary()); + ut_ad(!table->instant || !table->is_temporary()); + + return n_core_fields != n_fields + || (is_primary() && table->instant); +} + +/** @return whether the index is aborted (online_status at or beyond +ONLINE_INDEX_ABORTED), flagged DICT_CORRUPT, or belongs to a table +flagged as corrupted */ +inline bool dict_index_t::is_corrupted() const +{ + return UNIV_UNLIKELY(online_status >= ONLINE_INDEX_ABORTED + || (type & DICT_CORRUPT) + || (table && table->corrupted)); +} + +inline void dict_index_t::clear_instant_add() +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(is_instant()); + DBUG_ASSERT(!table->instant); + for (unsigned i= n_core_fields; i < n_fields; i++) + fields[i].col->clear_instant(); + n_core_fields= n_fields; + n_core_null_bytes= 
static_cast<byte> + (UT_BITS_IN_BYTES(static_cast<unsigned>(n_nullable))); +} + +inline void dict_index_t::clear_instant_alter() +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(n_fields == n_def); + + if (!table->instant) { + if (is_instant()) { + clear_instant_add(); + } + return; + } + +#ifndef DBUG_OFF + for (unsigned i = first_user_field(); i--; ) { + DBUG_ASSERT(!fields[i].col->is_dropped()); + DBUG_ASSERT(!fields[i].col->is_nullable()); + } +#endif + const dict_col_t* ai_col = table->persistent_autoinc + ? fields[table->persistent_autoinc - 1].col + : NULL; + dict_field_t* const begin = &fields[first_user_field()]; + dict_field_t* end = &fields[n_fields]; + + for (dict_field_t* d = begin; d < end; ) { + /* Move fields for dropped columns to the end. */ + if (!d->col->is_dropped()) { + d++; + } else { + if (d->col->is_nullable()) { + n_nullable--; + } + + std::swap(*d, *--end); + } + } + + DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end); + n_core_fields = n_fields = n_def + = static_cast<unsigned>(end - fields) & MAX_N_FIELDS; + n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_nullable)); + std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b) + { return a.col->ind < b.col->ind; }); + table->instant = NULL; + if (ai_col) { + auto a = std::find_if(begin, end, + [ai_col](const dict_field_t& f) + { return f.col == ai_col; }); + table->persistent_autoinc = (a == end) + ? 
0 + : (1 + static_cast<unsigned>(a - fields)) + & MAX_N_FIELDS; + } +} + +/** @return whether the column was instantly dropped +@param[in] index the clustered index */ +inline bool dict_col_t::is_dropped(const dict_index_t& index) const +{ + DBUG_ASSERT(index.is_primary()); + DBUG_ASSERT(!is_dropped() == !index.table->instant); + DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped + && this < index.table->instant->dropped + + index.table->instant->n_dropped)); + return is_dropped(); +} + +/*******************************************************************//** +Initialise the table lock list. */ +void +lock_table_lock_list_init( +/*======================*/ + table_lock_list_t* locks); /*!< List to initialise */ + +/** A function object to add the foreign key constraint to the referenced set +of the referenced table, if it exists in the dictionary cache. */ +struct dict_foreign_add_to_referenced_table { + void operator()(dict_foreign_t* foreign) const + { + if (dict_table_t* table = foreign->referenced_table) { + std::pair<dict_foreign_set::iterator, bool> ret + = table->referenced_set.insert(foreign); + ut_a(ret.second); + } + } +}; + +/** Check whether the col is used in spatial index or regular index. +@param[in] col column to check +@return spatial status */ +inline +spatial_status_t +dict_col_get_spatial_status( + const dict_col_t* col) +{ + spatial_status_t spatial_status = SPATIAL_NONE; + + /* Column is not a part of any index. */ + if (!col->ord_part) { + return(spatial_status); + } + + if (DATA_GEOMETRY_MTYPE(col->mtype)) { + if (col->max_prefix == 0) { + spatial_status = SPATIAL_ONLY; + } else { + /* Any regular index on a geometry column + should have a prefix. */ + spatial_status = SPATIAL_MIXED; + } + } + + return(spatial_status); +} + +/** Clear defragmentation summary. */ +inline void dict_stats_empty_defrag_summary(dict_index_t* index) +{ + index->stat_defrag_n_pages_freed = 0; +} + +/** Clear defragmentation related index stats. 
*/ +inline void dict_stats_empty_defrag_stats(dict_index_t* index) +{ + index->stat_defrag_modified_counter = 0; + index->stat_defrag_n_page_split = 0; +} + +#include "dict0mem.ic" + +#endif /* dict0mem_h */ diff --git a/storage/innobase/include/dict0mem.ic b/storage/innobase/include/dict0mem.ic new file mode 100644 index 00000000..0a554a54 --- /dev/null +++ b/storage/innobase/include/dict0mem.ic @@ -0,0 +1,73 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0mem.ic +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "data0type.h" +#include "dict0mem.h" +#include "fil0fil.h" + +/**********************************************************************//** +This function poplulates a dict_index_t index memory structure with +supplied information. 
*/ +UNIV_INLINE +void +dict_mem_fill_index_struct( +/*=======================*/ + dict_index_t* index, /*!< out: index to be filled */ + mem_heap_t* heap, /*!< in: memory heap, or NULL (then index_name is stored uncopied and fields stays NULL) */ + const char* index_name, /*!< in: index name; duplicated into heap when heap is given */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields) /*!< in: number of fields */ +{ + + if (heap) { + index->heap = heap; + index->name = mem_heap_strdup(heap, index_name); + index->fields = (dict_field_t*) mem_heap_alloc( + heap, 1 + n_fields * sizeof(dict_field_t)); + } else { + index->name = index_name; + index->heap = NULL; + index->fields = NULL; + } + + index->type = type & ((1U << DICT_IT_BITS) - 1); + index->page = FIL_NULL; + index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + index->n_fields = static_cast<unsigned>(n_fields) + & index->MAX_N_FIELDS; + index->n_core_fields = static_cast<unsigned>(n_fields) + & index->MAX_N_FIELDS; + /* The '1 +' in the mem_heap_alloc() size above prevents allocation + of an empty mem block when n_fields is 0 */ + index->nulls_equal = false; +#ifdef BTR_CUR_HASH_ADAPT +#ifdef MYSQL_INDEX_DISABLE_AHI + index->disable_ahi = false; +#endif +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(index->magic_n = DICT_INDEX_MAGIC_N); +} diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h new file mode 100644 index 00000000..dfa6f2a2 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.h @@ -0,0 +1,61 @@ +/***************************************************************************** + +Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. 
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +#include "dict0pagecompress.ic" + +#endif diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic new file mode 100644 index 00000000..c959f9ca --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.ic @@ -0,0 +1,81 @@ +/***************************************************************************** + +Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. 
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Get the page compression level of a table. The ut_ad() checks in the body +expect a non-NULL table whose flags have the page compression flag set. +@return page compression level extracted from table->flags */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(dict_tf_get_page_compression(table->flags)); + + return(dict_tf_get_page_compression_level(table->flags)); +} + +/********************************************************************//** +Extract the page compression flag from table flags. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*=========================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_table_is_page_compressed( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_page_compression(table->flags)); +} diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h new file mode 100644 index 00000000..3f279205 --- /dev/null +++ b/storage/innobase/include/dict0priv.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0priv.h +Data dictionary private functions + +Created Fri 2 Jul 2010 13:30:38 EST - Sunny Bains +*******************************************************/ + +#ifndef dict0priv_h +#define dict0priv_h + +/**********************************************************************//** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. Note: Not to be called from outside dict0*c functions. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_get_low( +/*===============*/ + const char* table_name); /*!< in: table name */ + +/**********************************************************************//** +Checks if a table is in the dictionary cache. 
+@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_check_if_in_cache_low( +/*=============================*/ + const char* table_name); /*!< in: table name */ + +#include "dict0priv.ic" + +#endif /* dict0priv.h */ diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic new file mode 100644 index 00000000..2fcadc05 --- /dev/null +++ b/storage/innobase/include/dict0priv.ic @@ -0,0 +1,91 @@ +/***************************************************************************** + +Copyright (c) 2010, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0priv.ic +Data dictionary system private include file + +Created Wed 13 Oct 2010 16:10:14 EST Sunny Bains +***********************************************************************/ + +#include "dict0dict.h" +#include "dict0load.h" + +/**********************************************************************//** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. 
+@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_get_low( +/*===============*/ + const char* table_name) /*!< in: table name */ +{ + dict_table_t* table; + + ut_ad(table_name); + ut_ad(mutex_own(&dict_sys.mutex)); + + table = dict_table_check_if_in_cache_low(table_name); + + if (table && table->corrupted) { + ib::error error; + error << "Table " << table->name << "is corrupted"; + if (srv_load_corrupted) { + error << ", but innodb_force_load_corrupted is set"; + } else { + return(NULL); + } + } + + if (table == NULL) { + table = dict_load_table(table_name, DICT_ERR_IGNORE_NONE); + } + + ut_ad(!table || table->cached); + + return(table); +} + +/**********************************************************************//** +Checks if a table is in the dictionary cache. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_check_if_in_cache_low( +/*=============================*/ + const char* table_name) /*!< in: table name */ +{ + dict_table_t* table; + ulint table_fold; + + DBUG_ENTER("dict_table_check_if_in_cache_low"); + DBUG_PRINT("dict_table_check_if_in_cache_low", + ("table: '%s'", table_name)); + + ut_ad(table_name); + ut_ad(mutex_own(&dict_sys.mutex)); + + /* Look for the table name in the hash table */ + table_fold = ut_fold_string(table_name); + + HASH_SEARCH(name_hash, &dict_sys.table_hash, table_fold, + dict_table_t*, table, ut_ad(table->cached), + !strcmp(table->name.m_name, table_name)); + DBUG_RETURN(table); +} diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h new file mode 100644 index 00000000..cf0e2ada --- /dev/null +++ b/storage/innobase/include/dict0stats.h @@ -0,0 +1,251 @@ +/***************************************************************************** + +Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.h +Code used for calculating and manipulating table statistics. + +Created Jan 06, 2010 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_h +#define dict0stats_h + +#include "dict0types.h" +#include "trx0types.h" + +#define TABLE_STATS_NAME "mysql/innodb_table_stats" +#define INDEX_STATS_NAME "mysql/innodb_index_stats" + +enum dict_stats_upd_option_t { + DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the + statistics using a precise and slow + algo and save them to the persistent + storage, if the persistent storage is + not present then emit a warning and + fall back to transient stats */ + DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics + using an imprecise quick algo + without saving the results + persistently */ + DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense) + into a table and its indexes' statistics + members. The resulting stats correspond to an + empty table. If the table is using persistent + statistics, then they are saved on disk. 
*/ + DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats + from the persistent storage if the in-memory + structures have not been initialized yet, + otherwise do nothing */ +}; + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ + MY_ATTRIBUTE((nonnull)); + +/** @return whether persistent statistics is enabled for a given table */ +UNIV_INLINE +bool +dict_stats_is_persistent_enabled(const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off); /*!< in: explicitly disabled */ + +/** @return whether auto recalc is enabled for a given table*/ +UNIV_INLINE +bool +dict_stats_auto_recalc_is_enabled(const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. 
*/ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull)); + +#ifdef WITH_WSREP +/** Update the table modification counter and if necessary, +schedule new estimates for table and index statistics to be calculated. +@param[in,out] table persistent or temporary table +@param[in] trx transaction */ +void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx) + MY_ATTRIBUTE((nonnull)); +#else +/** Update the table modification counter and if necessary, +schedule new estimates for table and index statistics to be calculated. +@param[in,out] table persistent or temporary table */ +void dict_stats_update_if_needed_func(dict_table_t *table) + MY_ATTRIBUTE((nonnull)); +# define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t) +#endif + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. +@return DB_* error code or DB_SUCCESS */ +dberr_t +dict_stats_update( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + dict_stats_upd_option_t stats_upd_option); + /*!< in: whether to (re) calc + the stats or to fetch them from + the persistent storage */ + +/** Remove the information for a particular index's stats from the persistent +storage if it exists and if there is data stored for this index. +This function creates its own trx and commits it. + +We must modify system tables in a separate transaction in order to +adhere to the InnoDB design constraint that dict_sys.latch prevents +lock waits on system tables. 
If we modified system and user tables in +the same transaction, we should exclusively hold dict_sys.latch until +the transaction is committed, and effectively block other transactions +that will attempt to open any InnoDB tables. Because we have no +guarantee that user transactions will be committed fast, we cannot +afford to keep the system tables locked in a user transaction. +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_drop_index( +/*==================*/ + const char* tname, /*!< in: table name */ + const char* iname, /*!< in: index name */ + char* errstr, /*!< out: error message if != DB_SUCCESS + is returned */ + ulint errstr_sz);/*!< in: size of the errstr buffer */ + +/*********************************************************************//** +Removes the statistics for a table and all of its indexes from the +persistent storage if it exists and if there is data stored for the table. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_drop_table( +/*==================*/ + const char* table_name, /*!< in: table name */ + char* errstr, /*!< out: error message + if != DB_SUCCESS is returned */ + ulint errstr_sz); /*!< in: size of errstr buffer */ + +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. */ +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ + MY_ATTRIBUTE((nonnull)); + +/*********************************************************************//** +Renames a table in InnoDB persistent stats storage. +This function creates its own transaction and commits it. 
+@return DB_SUCCESS or error code */ +dberr_t +dict_stats_rename_table( +/*====================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + char* errstr, /*!< out: error string if != DB_SUCCESS + is returned */ + size_t errstr_sz); /*!< in: errstr size */ +/*********************************************************************//** +Renames an index in InnoDB persistent stats storage. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned +if the persistent stats do not exist. */ +dberr_t +dict_stats_rename_index( +/*====================*/ + const dict_table_t* table, /*!< in: table whose index + is renamed */ + const char* old_index_name, /*!< in: old index name */ + const char* new_index_name) /*!< in: new index name */ + __attribute__((warn_unused_result)); + +/** Save an individual index's statistic into the persistent statistics +storage. +@param[in] index index to be updated +@param[in] last_update timestamp of the stat +@param[in] stat_name name of the stat +@param[in] stat_value value of the stat +@param[in] sample_size n pages sampled or NULL +@param[in] stat_description description of the stat +@param[in,out] trx in case of NULL the function will +allocate and free the trx object. If it is not NULL then it will be +rolled back only in the case of error, but not freed. +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_index_stat( + dict_index_t* index, + time_t last_update, + const char* stat_name, + ib_uint64_t stat_value, + ib_uint64_t* sample_size, + const char* stat_description, + trx_t* trx); + +/** Report an error if updating table statistics failed because +.ibd file is missing, table decryption failed or table is corrupted. 
+@param[in,out] table Table +@param[in] defragment true if statistics is for defragment +@retval DB_DECRYPTION_FAILED if decryption of the table failed +@retval DB_TABLESPACE_DELETED if .ibd file is missing +@retval DB_CORRUPTION if table is marked as corrupted */ +dberr_t +dict_stats_report_error(dict_table_t* table, bool defragment = false) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#include "dict0stats.ic" + +#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS +void test_dict_stats_all(); +#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ + +#endif /* dict0stats_h */ diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic new file mode 100644 index 00000000..4972efe8 --- /dev/null +++ b/storage/innobase/include/dict0stats.ic @@ -0,0 +1,221 @@ +/***************************************************************************** + +Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.ic +Code used for calculating and manipulating table statistics. 
+ +Created Jan 23, 2012 Vasil Dimov +*******************************************************/ + +#include "dict0dict.h" +#include "srv0srv.h" + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ +{ + /* Not allowed to have both flags set, but a CREATE or ALTER + statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would + end up having both set. In this case we clear the OFF flag. */ + if (ps_on && ps_off) { + ps_off = FALSE; + } + + ib_uint32_t stat_persistent = 0; + + if (ps_on) { + stat_persistent |= DICT_STATS_PERSISTENT_ON; + } + + if (ps_off) { + stat_persistent |= DICT_STATS_PERSISTENT_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stat_persistent = stat_persistent; +} + +/** @return whether persistent statistics is enabled for a given table */ +UNIV_INLINE +bool +dict_stats_is_persistent_enabled(const dict_table_t* table) +{ + /* Because of the nature of this check (non-locking) it is possible + that a table becomes: + * PS-disabled immediately after this function has returned TRUE or + * PS-enabled immediately after this function has returned FALSE. + This means that it is possible that we do: + + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has + just been PS-disabled or + + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has + just been PS-enabled. + This is acceptable. 
Avoiding this would mean that we would have to + protect the stat_persistent with dict_sys.mutex like the + other ::stat_ members which would be too big performance penalty, + especially when this function is called from + dict_stats_update_if_needed(). */ + + /* we rely on this read to be atomic */ + ib_uint32_t stat_persistent = table->stat_persistent; + + if (stat_persistent & DICT_STATS_PERSISTENT_ON) { + ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); + return(true); + } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { + return(false); + } else { + return(srv_stats_persistent); + } +} + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off) /*!< in: explicitly disabled */ +{ + ut_ad(!auto_recalc_on || !auto_recalc_off); + + ib_uint32_t stats_auto_recalc = 0; + + if (auto_recalc_on) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; + } + + if (auto_recalc_off) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stats_auto_recalc = stats_auto_recalc; +} + +/** @return whether auto recalc is enabled for a given table*/ +UNIV_INLINE +bool +dict_stats_auto_recalc_is_enabled(const dict_table_t* table) +{ + /* we rely on this read to be atomic */ + ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; + + if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { + ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); + return(true); + } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { + 
return(false); + } else { + return(srv_stats_auto_recalc); + } +} + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!mutex_own(&dict_sys.mutex)); + + if (table->stat_initialized) { + return; + } + + dict_stats_upd_option_t opt; + + if (dict_stats_is_persistent_enabled(table)) { + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + dict_stats_update(table, opt); +} + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(mutex_own(&dict_sys.mutex)); + + ut_a(table->get_ref_count() == 0); + + if (!table->stat_initialized) { + return; + } + + table->stat_initialized = FALSE; + +#ifdef HAVE_valgrind + MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows); + MEM_UNDEFINED(&table->stat_clustered_index_size, + sizeof table->stat_clustered_index_size); + MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes, + sizeof table->stat_sum_of_other_index_sizes); + MEM_UNDEFINED(&table->stat_modified_counter, + sizeof table->stat_modified_counter); + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + MEM_UNDEFINED( + index->stat_n_diff_key_vals, + index->n_uniq + * sizeof index->stat_n_diff_key_vals[0]); + MEM_UNDEFINED( + index->stat_n_sample_sizes, + index->n_uniq + * sizeof index->stat_n_sample_sizes[0]); + MEM_UNDEFINED( + index->stat_n_non_null_key_vals, + index->n_uniq + * sizeof index->stat_n_non_null_key_vals[0]); + MEM_UNDEFINED( + 
&index->stat_index_size, + sizeof(index->stat_index_size)); + MEM_UNDEFINED( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); + } +#endif /* HAVE_valgrind */ +} diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h new file mode 100644 index 00000000..b210a2ec --- /dev/null +++ b/storage/innobase/include/dict0stats_bg.h @@ -0,0 +1,122 @@ +/***************************************************************************** + +Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.h +Code used for background table and index stats gathering. + +Created Apr 26, 2012 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_bg_h +#define dict0stats_bg_h + +#include "dict0types.h" +#include "os0thread.h" + +#ifdef HAVE_PSI_INTERFACE +extern mysql_pfs_key_t dict_stats_recalc_pool_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ + +#ifdef UNIV_DEBUG +/** Value of MySQL global used to disable dict_stats thread. 
*/ +extern my_bool innodb_dict_stats_disabled_debug; +#endif /* UNIV_DEBUG */ + +/*****************************************************************//** +Delete a given table from the auto recalc pool. +dict_stats_recalc_pool_del() */ +void +dict_stats_recalc_pool_del( +/*=======================*/ + const dict_table_t* table); /*!< in: table to remove */ + +/** Yield the data dictionary latch when waiting +for the background thread to stop accessing a table. +@param trx transaction holding the data dictionary locks */ +#define DICT_BG_YIELD(trx) do { \ + row_mysql_unlock_data_dictionary(trx); \ + os_thread_sleep(250000); \ + row_mysql_lock_data_dictionary(trx); \ +} while (0) + +/*****************************************************************//** +Request the background collection of statistics to stop for a table. +@retval true when no background process is active +@retval false when it is not safe to modify the table definition */ +UNIV_INLINE +bool +dict_stats_stop_bg( +/*===============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys.mutex)); + + if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) { + return(true); + } + + table->stats_bg_flag |= BG_STAT_SHOULD_QUIT; + return(false); +} + +/*****************************************************************//** +Wait until background stats thread has stopped using the specified table. +The caller must have locked the data dictionary using +row_mysql_lock_data_dictionary() and this function may unlock it temporarily +and restore the lock before it exits. +The background stats thread is guaranteed not to start using the specified +table after this function returns and before the caller unlocks the data +dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag +under dict_sys.mutex. 
*/ +void +dict_stats_wait_bg_to_stop_using_table( +/*===================================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx); /*!< in/out: transaction to use for + unlocking/locking the data dict */ +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread(). +Must be called before dict_stats task is started. */ +void dict_stats_init(); + +/*****************************************************************//** +Free resources allocated by dict_stats_init(); must be called +after dict_stats task has exited. */ +void dict_stats_deinit(); + +#ifdef UNIV_DEBUG +/** Disables dict stats thread. It's used by: + SET GLOBAL innodb_dict_stats_disabled_debug = 1 (0). +@param[in] save immediate result from check function */ +void dict_stats_disabled_debug_update(THD*, st_mysql_sys_var*, void*, + const void* save); +#endif /* UNIV_DEBUG */ + +/** Start the dict stats timer. */ +void dict_stats_start(); + +/** Shut down the dict_stats timer. */ +void dict_stats_shutdown(); + +/** Reschedule dict stats timer to run now. */ +void dict_stats_schedule_now(); + +#endif /* dict0stats_bg_h */ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h new file mode 100644 index 00000000..d0da45ab --- /dev/null +++ b/storage/innobase/include/dict0types.h @@ -0,0 +1,177 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0types.h +Data dictionary global types + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0types_h +#define dict0types_h + +#include <ut0mutex.h> +#include <rem0types.h> + +struct dict_col_t; +struct dict_field_t; +struct dict_index_t; +struct dict_table_t; +struct dict_foreign_t; +struct dict_v_col_t; + +struct ind_node_t; +struct tab_node_t; +struct dict_add_v_col_t; + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ibuf table and indexes's ID are assigned as the number +DICT_IBUF_ID_MIN plus the space id */ +#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL + +typedef ib_id_t table_id_t; +typedef ib_id_t index_id_t; + +/** Maximum transaction identifier */ +#define TRX_ID_MAX IB_ID_MAX + +/** The bit pattern corresponding to TRX_ID_MAX */ +extern const byte trx_id_max_bytes[8]; +extern const byte timestamp_max_bytes[7]; + +/** Error to ignore when we load table dictionary into memory. However, +the table and index will be marked as "corrupted", and caller will +be responsible to deal with corrupted table or index. 
+Note: please define the DICT_ERR_IGNORE_* as bits, so their value can +be or-ed together */ +enum dict_err_ignore_t { + DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */ + DICT_ERR_IGNORE_FK_NOKEY = 1, /*!< ignore error if any foreign + key is missing */ + DICT_ERR_IGNORE_INDEX_ROOT = 2, /*!< ignore error if index root + page is FIL_NULL or incorrect value */ + DICT_ERR_IGNORE_CORRUPT = 4, /*!< skip corrupted indexes */ + DICT_ERR_IGNORE_RECOVER_LOCK = 8, + /*!< Used when recovering table locks + for resurrected transactions. + Silently load a missing + tablespace, and do not load + incomplete index definitions. */ + /** ignore all errors above (1 | 2 | 4 | 8) */ + DICT_ERR_IGNORE_ALL = 15, + /** prepare to drop the table; do not attempt to load tablespace + (all bits of DICT_ERR_IGNORE_ALL plus bit 16) */ + DICT_ERR_IGNORE_DROP = 31 +}; + +/** Quiescing states for flushing tables to disk. */ +enum ib_quiesce_t { + QUIESCE_NONE, + QUIESCE_START, /*!< Initialise, prepare to start */ + QUIESCE_COMPLETE /*!< All done */ +}; + +#ifndef UNIV_INNOCHECKSUM +typedef ib_mutex_t DictSysMutex; +#endif /* !UNIV_INNOCHECKSUM */ + +/** Prefix for tmp tables, adopted from sql/table.h */ +#define TEMP_FILE_PREFIX "#sql" +#define TEMP_FILE_PREFIX_LENGTH 4 +#define TEMP_FILE_PREFIX_INNODB "#sql-ib" + +#define TEMP_TABLE_PREFIX "#sql" +#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX + +/** Table name wrapper for pretty-printing */ +struct table_name_t +{ + /** The name in internal representation */ + char* m_name; + + /** Default constructor; leaves m_name uninitialized */ + table_name_t() {} + /** Constructor + @param name the name to wrap; the pointer is stored, not copied */ + table_name_t(char* name) : m_name(name) {} + + /** @return the end of the schema name, that is, a pointer to the + first '/' in m_name (ut_ad() asserts that one was found) */ + const char* dbend() const + { + const char* sep = strchr(m_name, '/'); + ut_ad(sep); + return sep; + } + + /** @return the length of the schema name, in bytes */ + size_t dblen() const { return size_t(dbend() - m_name); } + + /** Determine the filename-safe encoded table name. 
+ @return the filename-safe encoded table name, that is, the part + of m_name that follows the first '/' */ + const char* basename() const { return dbend() + 1; } + + /** The start of the table basename suffix for partitioned tables */ + static const char part_suffix[4]; + + /** Determine the partition or subpartition name suffix. + @return the first occurrence of part_suffix in basename() + @retval NULL if the table is not partitioned */ + const char* part() const { return strstr(basename(), part_suffix); } + + /** @return whether this is a temporary or intermediate table name */ + inline bool is_temporary() const; +}; + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Dump the change buffer at startup */ +extern my_bool ibuf_dump; +/** Flag to control insert buffer debugging. */ +extern uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +/** Shift for spatial status */ +#define SPATIAL_STATUS_SHIFT 12 + +/** Mask to encode/decode spatial status. */ +#define SPATIAL_STATUS_MASK (3U << SPATIAL_STATUS_SHIFT) + +#if SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN +# error SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN +#endif + +/** whether a col is used in spatial index or regular index +Note: the spatial status is part of persistent undo log, +so we should not modify the values in MySQL 5.7 */ +enum spatial_status_t { + /** Unknown status (undo format in 5.7.9) */ + SPATIAL_UNKNOWN = 0, + + /** Not used in gis index. */ + SPATIAL_NONE = 1, + + /** Used in both spatial index and regular index. */ + SPATIAL_MIXED = 2, + + /** Only used in spatial index. */ + SPATIAL_ONLY = 3 +}; + +#endif diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h new file mode 100644 index 00000000..cb8b998f --- /dev/null +++ b/storage/innobase/include/dyn0buf.h @@ -0,0 +1,496 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0buf.h +The dynamically allocated buffer implementation + +Created 2013-03-16 Sunny Bains +*******************************************************/ + +#ifndef dyn0buf_h +#define dyn0buf_h + +#include "mem0mem.h" +#include "dyn0types.h" +#include "ilist.h" + + +/** Class that manages dynamic buffers. It uses an intrusive list +(sized_ilist) of mtr_buf_t::block_t instances. We don't use STL containers in +order to avoid the overhead of heap calls. Using a custom memory +allocator doesn't solve the problem either because we have to get +the memory from somewhere. We can't use the block_t::m_data as the +backend for the custom allocator because we would like the data in +the blocks to be contiguous. */ +class mtr_buf_t { +public: + /** Number of data bytes in one block: + SIZE - sizeof(m_node) + sizeof(m_used), that is, + DYN_ARRAY_DATA_SIZE - sizeof(ilist_node<>) + sizeof(uint32_t). + NOTE(review): sizeof(m_used) is added rather than subtracted; + confirm that this is intended. */ + enum { MAX_DATA_SIZE = DYN_ARRAY_DATA_SIZE + - sizeof(ilist_node<>) + sizeof(uint32_t) }; + + class block_t : public ilist_node<> { + public: + + block_t() + { + compile_time_assert(MAX_DATA_SIZE <= (2 << 15)); + init(); + } + + /** + Gets the number of used bytes in a block. 
+ @return m_used with DYN_BLOCK_FULL_FLAG masked out */ + ulint used() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(static_cast<ulint>(m_used & ~DYN_BLOCK_FULL_FLAG)); + } + + /** + Gets pointer to the start of data. + @return pointer to data */ + byte* start() + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return start of data - non const version */ + byte* begin() + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return end of used data - non const version + (begin() + m_used; unlike used(), DYN_BLOCK_FULL_FLAG + is not masked out here) */ + byte* end() + MY_ATTRIBUTE((warn_unused_result)) + { + return(begin() + m_used); + } + + /** + @return start of data - const version */ + const byte* begin() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return end of used data - const version + (begin() + m_used; unlike used(), DYN_BLOCK_FULL_FLAG + is not masked out here) */ + const byte* end() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(begin() + m_used); + } + + private: + /** + Reserve size bytes at the end of the used data. + @param size number of bytes to reserve + @return pointer to start of reserved space */ + template <typename Type> + Type push(uint32_t size) + { + Type ptr = reinterpret_cast<Type>(end()); + + m_used += size; + ut_ad(m_used <= uint32_t(MAX_DATA_SIZE)); + + return(ptr); + } + + /** + Close the block: record ptr as the end of the used data. + @param ptr end of used space; ut_ad() asserts that it lies + between begin() and begin() + m_buf_end 
*/ + void close(const byte* ptr) + { + /* Check that it is within bounds */ + ut_ad(ptr >= begin()); + ut_ad(ptr <= begin() + m_buf_end); + + /* We have done the boundary check above */ + m_used = uint32_t(ptr - begin()); + + ut_ad(m_used <= MAX_DATA_SIZE); + ut_d(m_buf_end = 0); + } + + /** + Initialise the block: no bytes used */ + void init() + { + m_used = 0; + ut_d(m_buf_end = 0); + ut_d(m_magic_n = DYN_BLOCK_MAGIC_N); + } + private: +#ifdef UNIV_DEBUG + /** If opened then this is the buffer end offset, else 0 */ + ulint m_buf_end; + + /** Magic number (DYN_BLOCK_MAGIC_N) */ + ulint m_magic_n; +#endif /* UNIV_DEBUG */ + + /** Storage */ + byte m_data[MAX_DATA_SIZE]; + + /** number of data bytes used in this block; + DYN_BLOCK_FULL_FLAG is set when the block becomes full. + NOTE(review): no code in this class sets DYN_BLOCK_FULL_FLAG; + only used() masks it out. */ + uint32_t m_used; + + friend class mtr_buf_t; + }; + + typedef sized_ilist<block_t> list_t; + + /** Default constructor: the list initially holds m_first_block only */ + mtr_buf_t() + : + m_heap(), + m_size() + { + push_back(&m_first_block); + } + + /** Destructor */ + ~mtr_buf_t() + { + erase(); + } + + /** Reset the buffer vector: free the heap (if any), leave only + m_first_block in the list and set the total size to 0. + NOTE(review): when m_heap is not NULL, m_first_block.init() is not + called, so m_first_block.m_used keeps its old value; confirm that + the buffer is not reused after erase() in that case. */ + void erase() + { + if (m_heap != NULL) { + mem_heap_free(m_heap); + m_heap = NULL; + + /* Initialise the list and add the first block. */ + m_list.clear(); + m_list.push_back(m_first_block); + } else { + m_first_block.init(); + ut_ad(m_list.size() == 1); + } + + m_size = 0; + } + + /** + Makes room on top and returns a pointer to a buffer in it. After + copying the elements, the caller must close the buffer using close(). + A new block is added if the last block does not have size free bytes. + @param size in bytes of the buffer; MUST be <= MAX_DATA_SIZE! + @return pointer to the buffer */ + byte* open(ulint size) + MY_ATTRIBUTE((warn_unused_result)) + { + ut_ad(size > 0); + ut_ad(size <= MAX_DATA_SIZE); + + block_t* block; + + block = has_space(size) ? back() : add_block(); + + ut_ad(block->m_used <= MAX_DATA_SIZE); + ut_d(block->m_buf_end = block->m_used + size); + + return(block->end()); + } + + /** + Closes the buffer returned by open, and adjusts the total size by + the change in the number of used bytes of the last block. 
+ @param ptr end of used space */ + void close(const byte* ptr) + { + ut_ad(!m_list.empty()); + block_t* block = back(); + + m_size -= block->used(); + + block->close(ptr); + + m_size += block->used(); + } + + /** + Makes room on top and returns a pointer to the added element. + The caller must copy the element to the pointer returned. + @param size in bytes of the element; must be <= MAX_DATA_SIZE + @return pointer to the element */ + template <typename Type> + Type push(uint32_t size) + { + ut_ad(size > 0); + ut_ad(size <= MAX_DATA_SIZE); + + block_t* block; + + block = has_space(size) ? back() : add_block(); + + m_size += size; + + /* See ISO C++03 14.2/4 for why "template" is required. */ + + return(block->template push<Type>(size)); + } + + /** + Pushes len bytes, copying them in pieces of at most + MAX_DATA_SIZE bytes. + @param ptr data to write + @param len data length in bytes */ + void push(const byte* ptr, uint32_t len) + { + while (len > 0) { + uint32_t n_copied = std::min(len, + uint32_t(MAX_DATA_SIZE)); + ::memmove(push<byte*>(n_copied), ptr, n_copied); + + ptr += n_copied; + len -= n_copied; + } + } + + /** + Returns a pointer to an element in the buffer. const version. + NOTE(review): find() returns NULL when pos is out of range; that + case is not checked before the block is dereferenced here. + @param pos position of element in bytes from start + @return pointer to element */ + template <typename Type> + const Type at(ulint pos) const + { + block_t* block = const_cast<block_t*>( + const_cast<mtr_buf_t*>(this)->find(pos)); + + return(reinterpret_cast<Type>(block->begin() + pos)); + } + + /** + Returns a pointer to an element in the buffer. non const version. + NOTE(review): find() returns NULL when pos is out of range; that + case is not checked before the block is dereferenced here. + @param pos position of element in bytes from start + @return pointer to element */ + template <typename Type> + Type at(ulint pos) + { + block_t* block = const_cast<block_t*>(find(pos)); + + return(reinterpret_cast<Type>(block->begin() + pos)); + } + + /** + Returns the size of the total stored data. 
+ Under UNIV_DEBUG, the sum of used() over all blocks is asserted + to be equal to m_size. + @return data size in bytes */ + ulint size() const + MY_ATTRIBUTE((warn_unused_result)) + { +#ifdef UNIV_DEBUG + ulint total_size = 0; + + for (list_t::iterator it = m_list.begin(), end = m_list.end(); + it != end; ++it) { + total_size += it->used(); + } + + ut_ad(total_size == m_size); +#endif /* UNIV_DEBUG */ + return(m_size); + } + + /** + Iterate over each block and call the functor. + @param functor called with a pointer to each block; iteration + stops as soon as it returns false + @return false if iteration was terminated. */ + template <typename Functor> + bool for_each_block(Functor& functor) const + { + for (list_t::iterator it = m_list.begin(), end = m_list.end(); + it != end; ++it) { + + if (!functor(&*it)) { + return false; + } + } + + return(true); + } + + /** + Iterate over each block and call the functor. + @param functor called with a pointer to each block; iteration + stops as soon as it returns false + @return false if iteration was terminated. */ + template <typename Functor> + bool for_each_block(const Functor& functor) const + { + for (typename list_t::iterator it = m_list.begin(), + end = m_list.end(); + it != end; ++it) { + + if (!functor(&*it)) { + return false; + } + } + + return(true); + } + + /** + Iterate over all the blocks in reverse and call the functor. + @param functor called with a pointer to each block; iteration + stops as soon as it returns false + @return false if iteration was terminated. */ + template <typename Functor> + bool for_each_block_in_reverse(Functor& functor) const + { + for (list_t::reverse_iterator it = m_list.rbegin(), + end = m_list.rend(); + it != end; ++it) { + + if (!functor(&*it)) { + return false; + } + } + + return(true); + } + + /** + Iterate over all the blocks in reverse and call the functor. + @param functor called with a pointer to each block; iteration + stops as soon as it returns false + @return false if iteration was terminated. 
*/ + template <typename Functor> + bool for_each_block_in_reverse(const Functor& functor) const + { + for (list_t::reverse_iterator it = m_list.rbegin(), + end = m_list.rend(); + it != end; ++it) { + + if (!functor(&*it)) { + return false; + } + } + + return(true); + } + + /** + @return the first block */ + block_t* front() + MY_ATTRIBUTE((warn_unused_result)) + { + return &m_list.front(); + } + + /** + @return true if no heap has been created by add_block(), + that is, m_heap is NULL */ + bool is_small() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_heap == NULL); + } + + /** @return whether the buffer is empty + (the last block has no used bytes) */ + bool empty() const { return !back()->m_used; } + +private: + // Disable copying + mtr_buf_t(const mtr_buf_t&); + mtr_buf_t& operator=(const mtr_buf_t&); + + /** + Initialise the block and add it to the end of the list */ + void push_back(block_t* block) + { + block->init(); + m_list.push_back(*block); + } + + /** @return the last block in the list */ + block_t* back() const + { + return &const_cast<block_t&>(m_list.back()); + } + + /** + @param size number of bytes requested + @return true if request can be fulfilled by the last block */ + bool has_space(ulint size) const + { + return(back()->m_used + size <= MAX_DATA_SIZE); + } + + /** + @param size number of bytes requested + @return true if request can be fulfilled by the last block */ + bool has_space(ulint size) + { + return(back()->m_used + size <= MAX_DATA_SIZE); + } + + /** Find the block that contains the pos. + @param pos absolute offset, it is updated to make it relative + to the block + @return the block containing the pos. + @retval NULL if pos is at or past the end of the used data 
*/ + block_t* find(ulint& pos) + { + ut_ad(!m_list.empty()); + + for (list_t::iterator it = m_list.begin(), end = m_list.end(); + it != end; ++it) { + + if (pos < it->used()) { + ut_ad(it->used() >= pos); + + return &*it; + } + + pos -= it->used(); + } + + return NULL; + } + + /** + Allocate and add a new block to m_list. The block is allocated + from m_heap, which is created on first use. The block_t + constructor is not invoked; push_back() calls init() on the block. + @return the new block */ + block_t* add_block() + { + block_t* block; + + if (m_heap == NULL) { + m_heap = mem_heap_create(sizeof(*block)); + } + + block = reinterpret_cast<block_t*>( + mem_heap_alloc(m_heap, sizeof(*block))); + + push_back(block); + + return(block); + } + +private: + /** Heap to use for memory allocation */ + mem_heap_t* m_heap; + + /** Allocated blocks */ + list_t m_list; + + /** Total size used by all blocks */ + ulint m_size; + + /** The default block, should always be the first element. This + is for backwards compatibility and to avoid an extra heap allocation + for small REDO log records */ + block_t m_first_block; +}; + +#endif /* dyn0buf_h */ diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h new file mode 100644 index 00000000..83d0b0d6 --- /dev/null +++ b/storage/innobase/include/dyn0types.h @@ -0,0 +1,39 @@ +/***************************************************************************** + +Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0types.h +The dynamically allocated buffer types and constants + +Created 2013-03-16 Sunny Bains +*******************************************************/ + +#ifndef dyn0types_h +#define dyn0types_h + +/** Value of dyn_block_t::magic_n */ +#define DYN_BLOCK_MAGIC_N 375767 + +/** This is the initial 'payload' size of a dynamic array */ +#define DYN_ARRAY_DATA_SIZE 512 + +/** Flag for dyn_block_t::used that indicates a full block */ +#define DYN_BLOCK_FULL_FLAG 0x1000000UL + +#endif /* dyn0types_h */ diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h new file mode 100644 index 00000000..ebd40924 --- /dev/null +++ b/storage/innobase/include/eval0eval.h @@ -0,0 +1,109 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.h +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef eval0eval_h +#define eval0eval_h + +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/*****************************************************************//** +Free the buffer from global dynamic memory for a value of a que_node, +if it has been allocated in the above function. The freeing for pushed +column values is done in sel_col_prefetch_buf_free. */ +void +eval_node_free_val_buf( +/*===================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node); /*!< in: symbol table node */ +/*****************************************************************//** +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node); /*!< in: expression */ +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val); /*!< in: value to set */ +/*****************************************************************//** +Gets an integer value from an expression node. 
+@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node); /*!< in: expression node */ +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len); /*!< in: string length or UNIV_SQL_NULL */ +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2); /*!< in: node to copy from */ +/*****************************************************************//** +Gets a iboolean value from a query node. +@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a comparison node. +@return the result of the comparison */ +ibool +eval_cmp( +/*=====*/ + func_node_t* cmp_node); /*!< in: comparison node */ + + +#include "eval0eval.ic" + +#endif diff --git a/storage/innobase/include/eval0eval.ic b/storage/innobase/include/eval0eval.ic new file mode 100644 index 00000000..0ea4057f --- /dev/null +++ b/storage/innobase/include/eval0eval.ic @@ -0,0 +1,254 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.ic +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" +#include "rem0cmp.h" +#include "pars0grm.h" + +/*****************************************************************//** +Evaluates a function node. */ +void +eval_func( +/*======*/ + func_node_t* func_node); /*!< in: function node */ +/*****************************************************************//** +Allocate a buffer from global dynamic memory for a value of a que_node. +NOTE that this memory must be explicitly freed when the query graph is +freed. If the node already has allocated buffer, that buffer is freed +here. NOTE that this is the only function where dynamic memory should be +allocated for a query node val field. +@return pointer to allocated buffer */ +byte* +eval_node_alloc_val_buf( +/*====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size); /*!< in: buffer size */ + + +/*****************************************************************//** +Allocates a new buffer if needed. 
+The length of the node's value field is set to size in either case. +@return pointer to buffer: the existing data buffer of the node if it is +non-NULL and holds at least size bytes, otherwise the buffer returned by +eval_node_alloc_val_buf() */ +UNIV_INLINE +byte* +eval_node_ensure_val_buf( +/*=====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size) /*!< in: buffer size */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + dfield_set_len(dfield, size); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (!data || que_node_get_val_buf_size(node) < size) { + + data = eval_node_alloc_val_buf(node, size); + } + + return(data); +} + +/*****************************************************************//** +Evaluates a symbol table symbol. If the node has an indirection, the data +of the value field of the indirection is copied to the value field of the +node with dfield_copy_data(); otherwise nothing is done. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node) /*!< in: symbol table node */ +{ + + ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + if (sym_node->indirection) { + /* The symbol table node is an alias for a variable or a + column */ + + dfield_copy_data(que_node_get_val(sym_node), + que_node_get_val(sym_node->indirection)); + } +} + +/*****************************************************************//** +Evaluates an expression. A node of type QUE_NODE_SYMBOL is passed to +eval_sym(); any other node is treated as a function node and passed to +eval_func(). */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node) /*!< in: expression */ +{ + if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) { + + eval_sym((sym_node_t*) exp_node); + + return; + } + + eval_func(static_cast<func_node_t*>(exp_node)); +} + +/*****************************************************************//** +Sets an integer value as the value of an expression node. +The value is written with mach_write_to_4(); a 4-byte buffer is allocated +if the node has no data buffer yet. 
*/ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (data == NULL) { + data = eval_node_alloc_val_buf(node, 4); + } + + ut_ad(dfield_get_len(dfield) == 4); + + mach_write_to_4(data, (ulint) val); +} + +/*****************************************************************//** +Gets an integer value, which must not be SQL NULL, from an expression node. +The length of the value field is asserted with ut_ad() to be 4. +@return the value read with mach_read_from_4(), cast to int */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node) /*!< in: expression node */ +{ + const byte* ptr; + dfield_t* dfield; + + dfield = que_node_get_val(node); + ptr = static_cast<byte*>(dfield_get_data(dfield)); + + ut_ad(dfield_get_len(dfield) == 4); + + return((int) mach_read_from_4(ptr)); +} + +/*****************************************************************//** +Gets an ibool value from a query node. +@return the first data byte of the value, read with mach_read_from_1() */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node) /*!< in: query graph node */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + ut_ad(data != NULL); + + return(mach_read_from_1(data)); +} + +/*****************************************************************//** +Sets an ibool value as the value of a function node. +The value is written with mach_write_to_1(); a 1-byte buffer is allocated +if the node has no data buffer yet. 
*/ +UNIV_INLINE +void +eval_node_set_ibool_val( +/*====================*/ + func_node_t* func_node, /*!< in: function node */ + ibool val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(func_node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (data == NULL) { + /* Allocate 1 byte to hold the value */ + + data = eval_node_alloc_val_buf(func_node, 1); + } + + ut_ad(dfield_get_len(dfield) == 1); + + mach_write_to_1(data, val); +} + +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. +If len is UNIV_SQL_NULL, only the length of the value field of the node is +set to len and no data is copied. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len) /*!< in: string length or UNIV_SQL_NULL */ +{ + byte* data; + + if (len == UNIV_SQL_NULL) { + dfield_set_len(que_node_get_val(node), len); + + return; + } + + data = eval_node_ensure_val_buf(node, len); + + memcpy(data, str, len); +} + +/*****************************************************************//** +Copies a query node value to another node, by passing the data pointer and +length of the value field of node2 to eval_node_copy_and_alloc_val(). */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2) /*!< in: node to copy from */ +{ + dfield_t* dfield2; + + dfield2 = que_node_get_val(node2); + + eval_node_copy_and_alloc_val( + node1, + static_cast<byte*>(dfield_get_data(dfield2)), + dfield_get_len(dfield2)); +} diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h new file mode 100644 index 00000000..71700bb5 --- /dev/null +++ b/storage/innobase/include/eval0proc.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.h +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#ifndef eval0proc_h +#define eval0proc_h + +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an if-statement node. +@return query thread to run next or NULL */ +que_thr_t* +if_step( +/*====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a while-statement node. +@return query thread to run next or NULL */ +que_thr_t* +while_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a for-loop node. 
+@return query thread to run next or NULL */ +que_thr_t* +for_step( +/*=====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an assignment statement node. +@return query thread to run next or NULL */ +que_thr_t* +assign_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an exit statement node. +@return query thread to run next or NULL */ +que_thr_t* +exit_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a return-statement node. +@return query thread to run next or NULL */ +que_thr_t* +return_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ + +#include "eval0proc.ic" + +#endif diff --git a/storage/innobase/include/eval0proc.ic b/storage/innobase/include/eval0proc.ic new file mode 100644 index 00000000..b0c5f75b --- /dev/null +++ b/storage/innobase/include/eval0proc.ic @@ -0,0 +1,88 @@ +/***************************************************************************** + +Copyright (c) 1998, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.ic +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + proc_node_t* node; + + ut_ad(thr); + + node = static_cast<proc_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_PROC); + + if (thr->prev_node == que_node_get_parent(node)) { + /* Start execution from the first statement in the statement + list */ + + thr->run_node = node->stat_list; + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a procedure call node. 
+@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + func_node_t* node; + + ut_ad(thr); + + node = static_cast<func_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_FUNC); + + /* Evaluate the procedure */ + + eval_exp(node); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h new file mode 100644 index 00000000..872053dc --- /dev/null +++ b/storage/innobase/include/fil0crypt.h @@ -0,0 +1,455 @@ +/***************************************************************************** +Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. +Copyright (c) 2015, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0crypt.h +The low-level file system encryption support functions + +Created 04/01/2015 Jan Lindström +*******************************************************/ + +#ifndef fil0crypt_h +#define fil0crypt_h + +#include "os0event.h" +#include "my_crypt.h" +#include "fil0fil.h" + +/** +* Magic pattern in start of crypt data on page 0 +*/ +#define MAGIC_SZ 6 + +static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = { + 's', 0xE, 0xC, 'R', 'E', 't' }; + +/* This key will be used if nothing else is given */ +#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA + +extern os_event_t fil_crypt_threads_event; + +/** + * CRYPT_SCHEME_UNENCRYPTED + * + * Used as intermediate state when converting a space from unencrypted + * to encrypted + */ +/** + * CRYPT_SCHEME_1 + * + * xxx is AES_CTR or AES_CBC (or another block cypher with the same key and iv lengths) + * L = AES_ECB(KEY, IV) + * CRYPT(PAGE) = xxx(KEY=L, IV=C, PAGE) + */ + +#define CRYPT_SCHEME_1 1 +#define CRYPT_SCHEME_1_IV_LEN 16 +#define CRYPT_SCHEME_UNENCRYPTED 0 + +/* Cached L or key for given key_version */ +struct key_struct +{ + uint key_version; /*!< Version of the key */ + uint key_length; /*!< Key length */ + unsigned char key[MY_AES_MAX_KEY_LENGTH]; /*!< Cached key + (that is L in CRYPT_SCHEME_1) */ +}; + +/** is encryption enabled */ +extern ulong srv_encrypt_tables; + +/** Mutex helper for crypt_data->scheme +@param[in, out] scheme encryption scheme +@param[in] exit should we exit or enter mutex ?
*/ +void +crypt_data_scheme_locker( + st_encryption_scheme* scheme, + int exit); + +struct fil_space_rotate_state_t +{ + time_t start_time; /*!< time when rotation started */ + ulint active_threads; /*!< active threads in space */ + uint32_t next_offset; /*!< next "free" offset */ + uint32_t max_offset; /*!< max offset needing to be rotated */ + uint min_key_version_found; /*!< min key version found but not + rotated */ + lsn_t end_lsn; /*!< max lsn created when rotating this + space */ + bool starting; /*!< initial write of IV */ + bool flushing; /*!< space is being flushed at end of rotate */ +}; + +#ifndef UNIV_INNOCHECKSUM + +struct fil_space_crypt_t : st_encryption_scheme +{ + public: + /** Constructor. Does not initialize the members! + The object is expected to be placed in a buffer that + has been zero-initialized. */ + fil_space_crypt_t( + uint new_type, + uint new_min_key_version, + uint new_key_id, + fil_encryption_t new_encryption) + : st_encryption_scheme(), + min_key_version(new_min_key_version), + encryption(new_encryption), + key_found(0), + rotate_state() + { + key_id = new_key_id; + my_random_bytes(iv, sizeof(iv)); + mutex_create(LATCH_ID_FIL_CRYPT_DATA_MUTEX, &mutex); + locker = crypt_data_scheme_locker; + type = new_type; + + if (new_encryption == FIL_ENCRYPTION_OFF || + (!srv_encrypt_tables && + new_encryption == FIL_ENCRYPTION_DEFAULT)) { + type = CRYPT_SCHEME_UNENCRYPTED; + } else { + type = CRYPT_SCHEME_1; + min_key_version = key_get_latest_version(); + } + + key_found = min_key_version; + } + + /** Destructor */ + ~fil_space_crypt_t() + { + mutex_free(&mutex); + } + + /** Get latest key version from encryption plugin + @retval key_version or + @retval ENCRYPTION_KEY_VERSION_INVALID if used key_id + is not found from encryption plugin. */ + uint key_get_latest_version(void); + + /** Returns true if key was found from encryption plugin + and false if not. 
*/ + bool is_key_found() const { + return key_found != ENCRYPTION_KEY_VERSION_INVALID; + } + + /** Returns true if tablespace should be encrypted */ + bool should_encrypt() const { + return ((encryption == FIL_ENCRYPTION_ON) || + (srv_encrypt_tables && + encryption == FIL_ENCRYPTION_DEFAULT)); + } + + /** Return true unless encryption is explicitly FIL_ENCRYPTION_OFF; + FIL_ENCRYPTION_DEFAULT also yields true, regardless of + srv_encrypt_tables (compare should_encrypt()). */ + bool is_encrypted() const { + return (encryption != FIL_ENCRYPTION_OFF); + } + + /** Return true if default tablespace encryption is used. */ + bool is_default_encryption() const { + return (encryption == FIL_ENCRYPTION_DEFAULT); + } + + /** Return true if tablespace is not encrypted. */ + bool not_encrypted() const { + return (encryption == FIL_ENCRYPTION_OFF); + } + + /** Fill crypt data information to the given page. + It should be called during ibd file creation. + @param[in] flags tablespace flags + @param[in,out] page first page of the tablespace */ + void fill_page0(ulint flags, byte* page); + + /** Write encryption metadata to the first page. + @param[in,out] block first page of the tablespace + @param[in,out] mtr mini-transaction */ + void write_page0(buf_block_t* block, mtr_t* mtr); + + uint min_key_version; // min key version for this space + fil_encryption_t encryption; // Encryption setup + + ib_mutex_t mutex; // mutex protecting following variables + + /** Return code from encryption_key_get_latest_version. + If ENCRYPTION_KEY_VERSION_INVALID encryption plugin + could not find the key and there is no need to call + get_latest_key_version again as keys are read only + at startup.
*/ + uint key_found; + + fil_space_rotate_state_t rotate_state; +}; + +/** Status info about encryption */ +struct fil_space_crypt_status_t { + ulint space; /*!< tablespace id */ + ulint scheme; /*!< encryption scheme */ + uint min_key_version; /*!< min key version */ + uint current_key_version;/*!< current key version */ + uint keyserver_requests;/*!< no of key requests to key server */ + uint key_id; /*!< current key_id */ + bool rotating; /*!< is key rotation ongoing */ + bool flushing; /*!< is flush at end of rotation ongoing */ + ulint rotate_next_page_number; /*!< next page if key rotating */ + ulint rotate_max_page_number; /*!< max page if key rotating */ +}; + +/** Statistics about encryption key rotation */ +struct fil_crypt_stat_t { + ulint pages_read_from_cache; + ulint pages_read_from_disk; + ulint pages_modified; + ulint pages_flushed; + ulint estimated_iops; +}; + +/********************************************************************* +Init space crypt */ +UNIV_INTERN +void +fil_space_crypt_init(); + +/********************************************************************* +Cleanup space crypt */ +UNIV_INTERN +void +fil_space_crypt_cleanup(); + +/** +Create a fil_space_crypt_t object +@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or + FIL_ENCRYPTION_ON or + FIL_ENCRYPTION_OFF + +@param[in] key_id Encryption key id +@return crypt object */ +UNIV_INTERN +fil_space_crypt_t* +fil_space_create_crypt_data( + fil_encryption_t encrypt_mode, + uint key_id) + MY_ATTRIBUTE((warn_unused_result)); + +/****************************************************************** +Merge fil_space_crypt_t object +@param[in,out] dst Destination crypt data +@param[in] src Source crypt data */ +UNIV_INTERN +void +fil_space_merge_crypt_data( + fil_space_crypt_t* dst, + const fil_space_crypt_t* src); + +/** Initialize encryption parameters from a tablespace header page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] page first page of the tablespace +@return crypt data from page 0 +@retval NULL if not present or not valid */ +fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** +Free a crypt data object +@param[in,out] crypt_data crypt data to be freed */ +UNIV_INTERN +void +fil_space_destroy_crypt_data( + fil_space_crypt_t **crypt_data); + +/** Amend encryption information from redo log. +@param[in] space tablespace +@param[in] data encryption metadata */ +void fil_crypt_parse(fil_space_t* space, const byte* data); + +/** Encrypt a buffer. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used +@return encrypted buffer or NULL */ +UNIV_INTERN +byte* +fil_encrypt_buf( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + const byte* src_frame, + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) + MY_ATTRIBUTE((warn_unused_result)); + +/** +Encrypt a page. + +@param[in] space Tablespace +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +byte* fil_space_encrypt( + const fil_space_t* space, + ulint offset, + byte* src_frame, + byte* dst_frame) + MY_ATTRIBUTE((warn_unused_result)); + + +/** Decrypt a page. 
+@param[in] space_id space id +@param[in] crypt_data crypt_data +@param[in] tmp_frame Temporary buffer +@param[in] physical_size page size +@param[in] fsp_flags Tablespace flags +@param[in,out] src_frame Page to decrypt +@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED +@return true if page decrypted, false if not.*/ +UNIV_INTERN +bool +fil_space_decrypt( + ulint space_id, + fil_space_crypt_t* crypt_data, + byte* tmp_frame, + ulint physical_size, + ulint fsp_flags, + byte* src_frame, + dberr_t* err); + +/****************************************************************** +Decrypt a page +@param[in] space Tablespace +@param[in] tmp_frame Temporary buffer used for decrypting +@param[in,out] src_frame Page to decrypt +@return decrypted page, or original not encrypted page if decryption is +not needed.*/ +UNIV_INTERN +byte* +fil_space_decrypt( + const fil_space_t* space, + byte* tmp_frame, + byte* src_frame) + MY_ATTRIBUTE((warn_unused_result)); + +/** +Calculate post encryption checksum +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] dst_frame Block where checksum is calculated +@return page checksum
*/ +uint32_t +fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame) + MY_ATTRIBUTE((warn_unused_result)); + +/********************************************************************* +Adjust thread count for key rotation +@param[in] new_cnt Number of threads to be used */ +UNIV_INTERN +void +fil_crypt_set_thread_cnt( + uint new_cnt); + +/********************************************************************* +Adjust max key age +@param[in] val New max key age */ +UNIV_INTERN +void +fil_crypt_set_rotate_key_age( + uint val); + +/********************************************************************* +Adjust rotation iops +@param[in] val New max rotation iops */ +UNIV_INTERN +void +fil_crypt_set_rotation_iops( + uint val); + +/********************************************************************* +Adjust encrypt tables +@param[in] val New setting for innodb-encrypt-tables */ +void fil_crypt_set_encrypt_tables(ulong val); + +/********************************************************************* +Init threads for key rotation */ +UNIV_INTERN +void +fil_crypt_threads_init(); + +/********************************************************************* +Clean up key rotation threads resources */ +UNIV_INTERN +void +fil_crypt_threads_cleanup(); + +/********************************************************************* +Wait for crypt threads to stop accessing space +@param[in] space Tablespace */ +UNIV_INTERN +void +fil_space_crypt_close_tablespace( + const fil_space_t* space); + +/********************************************************************* +Get crypt status for a space (used by information_schema) +@param[in] space Tablespace +@param[out] status Crypt status */ +UNIV_INTERN +void +fil_space_crypt_get_status( + const fil_space_t* space, + struct fil_space_crypt_status_t* status); + +/********************************************************************* +Return crypt statistics +@param[out] stat Crypt statistics */ +UNIV_INTERN
+void +fil_crypt_total_stat( + fil_crypt_stat_t *stat); + +#include "fil0crypt.ic" +#endif /* !UNIV_INNOCHECKSUM */ + +/** +Verify that post encryption checksum match calculated checksum. +This function should be called only if tablespace contains crypt_data +metadata (this is strong indication that tablespace is encrypted). +Function also verifies that traditional checksum does not match +calculated checksum as if it does page could be valid unencrypted, +encrypted, or corrupted. + +@param[in,out] page page frame (checksum is temporarily modified) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return true if page is encrypted AND OK, false otherwise */ +bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size) + MY_ATTRIBUTE((warn_unused_result)); + +/** Add the tablespace to the rotation list if +innodb_encrypt_rotate_key_age is 0 or encryption plugin does +not do key version rotation +@return whether the tablespace should be added to rotation list */ +bool fil_crypt_must_default_encrypt(); + +#endif /* fil0crypt_h */ diff --git a/storage/innobase/include/fil0crypt.ic b/storage/innobase/include/fil0crypt.ic new file mode 100644 index 00000000..cc59b394 --- /dev/null +++ b/storage/innobase/include/fil0crypt.ic @@ -0,0 +1,81 @@ +/***************************************************************************** + +Copyright (c) 2015, 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0crypt.ic +The low-level file system encryption support functions + +Created 04/01/2015 Jan Lindström +*******************************************************/ + +/*******************************************************************//** +Find out whether the page is page encrypted +@return true if page is page encrypted, false if not */ +UNIV_INLINE +bool +fil_page_is_encrypted( +/*==================*/ + const byte *buf) /*!< in: page */ +{ + return(mach_read_from_4(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0); +} + +/*******************************************************************//** +Get current encryption mode from crypt_data. +@return string representation */ +UNIV_INLINE +const char * +fil_crypt_get_mode( +/*===============*/ + const fil_space_crypt_t* crypt_data) +{ + switch (crypt_data->encryption) { + case FIL_ENCRYPTION_DEFAULT: + return("Default tablespace encryption mode"); + case FIL_ENCRYPTION_ON: + return("Tablespace encrypted"); + case FIL_ENCRYPTION_OFF: + return("Tablespace not encrypted"); + } + + ut_error; + return ("NULL"); +} + +/*******************************************************************//** +Get current encryption type from crypt_data. 
+@return string representation */ +UNIV_INLINE +const char * +fil_crypt_get_type( + const fil_space_crypt_t* crypt_data) +{ + ut_ad(crypt_data != NULL); + switch (crypt_data->type) { + case CRYPT_SCHEME_UNENCRYPTED: + return("scheme unencrypted"); + break; + case CRYPT_SCHEME_1: + return("scheme encrypted"); + break; + default: + ut_error; + } + + return ("NULL"); +} diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h new file mode 100644 index 00000000..57b10351 --- /dev/null +++ b/storage/innobase/include/fil0fil.h @@ -0,0 +1,1799 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0fil.h +The low-level file system + +Created 10/25/1995 Heikki Tuuri +*******************************************************/ + +#ifndef fil0fil_h +#define fil0fil_h + +#include "fsp0types.h" +#include "mach0data.h" +#include "assume_aligned.h" + +#ifndef UNIV_INNOCHECKSUM + +#include "buf0dblwr.h" +#include "hash0hash.h" +#include "log0recv.h" +#include "dict0types.h" +#include "ilist.h" +#include <set> +#include <mutex> + +struct unflushed_spaces_tag_t; +struct rotation_list_tag_t; + +// Forward declaration +extern my_bool srv_use_doublewrite_buf; + +/** Possible values of innodb_flush_method */ +enum srv_flush_t +{ + /** fsync, the default */ + SRV_FSYNC= 0, + /** open log files in O_DSYNC mode */ + SRV_O_DSYNC, + /** do not call os_file_flush() when writing data files, but do flush + after writing to log files */ + SRV_LITTLESYNC, + /** do not flush after writing */ + SRV_NOSYNC, + /** invoke os_file_set_nocache() on data files. This implies using + non-buffered IO but still using fsync, the reason for which is that + some FS do not flush meta-data when unbuffered IO happens */ + SRV_O_DIRECT, + /** do not use fsync() when using direct IO i.e.: it can be set to + avoid the fsync() call that we make when using SRV_O_DIRECT. + However, in this case user/DBA should be sure about the integrity of + the meta-data */ + SRV_O_DIRECT_NO_FSYNC +#ifdef _WIN32 + /** Traditional Windows approach to open all files without caching, + and do FlushFileBuffers() */ + ,SRV_ALL_O_DIRECT_FSYNC +#endif +}; + +/** innodb_flush_method */ +extern ulong srv_file_flush_method; + +/** Undo tablespaces starts with space_id.
*/ +extern ulint srv_undo_space_id_start; +/** The number of UNDO tablespaces that are open and ready to use. */ +extern ulint srv_undo_tablespaces_open; + +/** Check whether given space id is undo tablespace id +@param[in] space_id space id to check +@return true if it is undo tablespace else false. */ +inline bool srv_is_undo_tablespace(ulint space_id) +{ + return srv_undo_space_id_start > 0 && + space_id >= srv_undo_space_id_start && + space_id < srv_undo_space_id_start + srv_undo_tablespaces_open; +} + +class page_id_t; + +/** Structure containing encryption specification */ +struct fil_space_crypt_t; + +/** File types */ +enum fil_type_t { + /** temporary tablespace (temporary undo log or tables) */ + FIL_TYPE_TEMPORARY, + /** a tablespace that is being imported (no logging until finished) */ + FIL_TYPE_IMPORT, + /** persistent tablespace (for system, undo log or tables) */ + FIL_TYPE_TABLESPACE, +}; + +struct fil_node_t; + +/** Structure to store first and last value of range */ +struct range_t +{ + uint32_t first; + uint32_t last; +}; + +/** Sort the range based on first value of the range */ +struct range_compare +{ + bool operator() (const range_t lhs, const range_t rhs) const + { + return lhs.first < rhs.first; + } +}; + +using range_set_t= std::set<range_t, range_compare>; +/** Range to store the set of ranges of integers */ +class range_set +{ +private: + range_set_t ranges; + + /** Find the range that contains a value. + @param[in] value value to look up + @return iterator to the range whose [first,last] contains value + @retval ranges.end() if no stored range contains value */ + range_set_t::iterator find(uint32_t value) const + { + auto r_offset= ranges.lower_bound({value, value}); + const auto r_end= ranges.end(); + /* lower_bound() yields the first range with first >= value. + Unless that range starts exactly at value, the only range that + can contain value is its predecessor (the last range starting + before value), so step back whenever a predecessor exists. + Stepping back only when lower_bound() returned end() would miss + values lying strictly inside any range other than the last. */ + if (r_offset != r_end && r_offset->first == value) + return r_offset; + if (r_offset == ranges.begin()) + return r_end; + r_offset= std::prev(r_offset); + if (r_offset->last >= value) + return r_offset; + return r_end; + } +public: + /** Merge the current range with previous range.
+ @param[in] range range to be merged + @param[in] prev_range range to be merged with next */ + void merge_range(range_set_t::iterator range, + range_set_t::iterator prev_range) + { + if (range->first != prev_range->last + 1) + return; + + /* Merge the current range with previous range */ + range_t new_range {prev_range->first, range->last}; + ranges.erase(prev_range); + ranges.erase(range); + ranges.emplace(new_range); + } + + /** Split the range and add two more ranges + @param[in] range range to be split + @param[in] value Value to be removed from range */ + void split_range(range_set_t::iterator range, uint32_t value) + { + range_t split1{range->first, value - 1}; + range_t split2{value + 1, range->last}; + + /* Remove the existing element */ + ranges.erase(range); + + /* Insert the two elements */ + ranges.emplace(split1); + ranges.emplace(split2); + } + + /** Remove the value with the given range + @param[in,out] range range to be changed + @param[in] value value to be removed */ + void remove_within_range(range_set_t::iterator range, uint32_t value) + { + range_t new_range{range->first, range->last}; + if (value == range->first) + { + if (range->first == range->last) + { + ranges.erase(range); + return; + } + else + new_range.first++; + } + else if (value == range->last) + new_range.last--; + else if (range->first < value && range->last > value) + return split_range(range, value); + + ranges.erase(range); + ranges.emplace(new_range); + } + + /** Remove the value from the ranges. + @param[in] value Value to be removed. 
*/ + void remove_value(uint32_t value) + { + if (empty()) + return; + range_t new_range {value, value}; + range_set_t::iterator range= ranges.lower_bound(new_range); + if (range == ranges.end()) + return remove_within_range(std::prev(range), value); + + if (range->first > value && range != ranges.begin()) + /* Iterate the previous ranges to delete */ + return remove_within_range(std::prev(range), value); + return remove_within_range(range, value); + } + /** Add the value within the existing range + @param[in] range range to be modified + @param[in] value value to be added */ + range_set_t::iterator add_within_range(range_set_t::iterator range, + uint32_t value) + { + if (range->first <= value && range->last >= value) + return range; + + range_t new_range{range->first, range->last}; + if (range->last + 1 == value) + new_range.last++; + else if (range->first - 1 == value) + new_range.first--; + else return ranges.end(); + ranges.erase(range); + return ranges.emplace(new_range).first; + } + /** Add the range in the ranges set + @param[in] new_range range to be added */ + void add_range(range_t new_range) + { + auto r_offset= ranges.lower_bound(new_range); + auto r_begin= ranges.begin(); + auto r_end= ranges.end(); + if (!ranges.size()) + { +new_range: + ranges.emplace(new_range); + return; + } + + if (r_offset == r_end) + { + /* last range */ + if (add_within_range(std::prev(r_offset), new_range.first) == r_end) + goto new_range; + } + else if (r_offset == r_begin) + { + /* First range */ + if (add_within_range(r_offset, new_range.first) == r_end) + goto new_range; + } + else if (r_offset->first - 1 == new_range.first) + { + /* Change starting of the existing range */ + auto r_value= add_within_range(r_offset, new_range.first); + if (r_value != ranges.begin()) + merge_range(r_value, std::prev(r_value)); + } + else + { + /* previous range last_value alone */ + if (add_within_range(std::prev(r_offset), new_range.first) == r_end) + goto new_range; + } + } + + /** Add the 
value in the ranges + @param[in] value value to be added */ + void add_value(uint32_t value) + { + range_t new_range{value, value}; + add_range(new_range); + } + + bool remove_if_exists(uint32_t value) + { + auto r_offset= find(value); + if (r_offset != ranges.end()) + { + remove_within_range(r_offset, value); + return true; + } + return false; + } + + bool contains(uint32_t value) const + { + return find(value) != ranges.end(); + } + + ulint size() { return ranges.size(); } + void clear() { ranges.clear(); } + bool empty() const { return ranges.empty(); } + typename range_set_t::iterator begin() { return ranges.begin(); } + typename range_set_t::iterator end() { return ranges.end(); } +}; +#endif + +/** Tablespace or log data space */ +#ifndef UNIV_INNOCHECKSUM +struct fil_io_t +{ + /** error code */ + dberr_t err; + /** file; node->space->release() must follow IORequestRead call */ + fil_node_t *node; +}; + +/** Tablespace encryption mode */ +enum fil_encryption_t +{ + /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */ + FIL_ENCRYPTION_DEFAULT, + /** Encrypted */ + FIL_ENCRYPTION_ON, + /** Not encrypted */ + FIL_ENCRYPTION_OFF +}; + +struct fil_space_t final : + ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t> +#else +struct fil_space_t final +#endif +{ +#ifndef UNIV_INNOCHECKSUM + friend fil_node_t; + ulint id; /*!< space id */ + hash_node_t hash; /*!< hash chain node */ + char* name; /*!< Tablespace name */ + lsn_t max_lsn; + /*!< LSN of the most recent + fil_names_write_if_was_clean(). + Reset to 0 by fil_names_clear(). + Protected by log_sys.mutex. + If and only if this is nonzero, the + tablespace will be in named_spaces. 
*/ + /** whether undo tablespace truncation is in progress */ + bool is_being_truncated; + fil_type_t purpose;/*!< purpose */ + UT_LIST_BASE_NODE_T(fil_node_t) chain; + /*!< base node for the file chain */ + uint32_t size; /*!< tablespace file size in pages; + 0 if not known yet */ + uint32_t size_in_header; + /* FSP_SIZE in the tablespace header; + 0 if not known yet */ + uint32_t free_len; + /*!< length of the FSP_FREE list */ + uint32_t free_limit; + /*!< contents of FSP_FREE_LIMIT */ + uint32_t recv_size; + /*!< recovered tablespace size in pages; + 0 if no size change was read from the redo log, + or if the size change was implemented */ + uint32_t n_reserved_extents; + /*!< number of reserved free extents for + ongoing operations like B-tree page split */ +private: + /** the committed size of the tablespace in pages */ + Atomic_relaxed<uint32_t> committed_size; + /** Number of pending operations on the file. + The tablespace cannot be freed while (n_pending & PENDING) != 0. */ + std::atomic<uint32_t> n_pending; + /** Flag in n_pending that indicates that the tablespace is being + deleted, and no further operations should be performed */ + static constexpr uint32_t STOPPING= 1U << 31; + /** Flag in n_pending that indicates that the tablespace is a candidate + for being closed, and fil_node_t::is_open() can only be trusted after + acquiring fil_system.mutex and resetting the flag */ + static constexpr uint32_t CLOSING= 1U << 30; + /** Flag in n_pending that indicates that the tablespace needs fsync(). 
This must be the least significant flag bit; @see release_flush() */
	static constexpr uint32_t NEEDS_FSYNC= 1U << 29;
	/** Mask of the reference count bits in n_pending
	(every bit that is not one of the flags above) */
	static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
public:
	rw_lock_t	latch;	/*!< latch protecting the file space storage
				allocation */
	UT_LIST_NODE_T(fil_space_t) named_spaces;
				/*!< list of spaces for which FILE_MODIFY
				records have been issued */
	UT_LIST_NODE_T(fil_space_t) space_list;
				/*!< list of all spaces */

	/** MariaDB encryption data */
	fil_space_crypt_t* crypt_data;

	/** Whether this tablespace is in a list of unflushed tablespaces. */
	bool		is_in_unflushed_spaces;

	/** Whether this tablespace is in the list of tablespaces that
	need key rotation.
	NOTE(review): presumably fil_system.default_encrypt_tables;
	confirm against the code that sets this flag. */
	bool		is_in_default_encrypt;

	/** True if the device this filespace is on supports atomic writes */
	bool		atomic_write_supported;

	/** True if file system storing this tablespace supports
	punch hole */
	bool		punch_hole;

	/** mutex to protect freed ranges */
	std::mutex	freed_range_mutex;

	/** Variables to store freed ranges. This can be used to write
	zeroes/punch the hole in files. Protected by freed_range_mutex */
	range_set	freed_ranges;

	/** Stores last page freed lsn. Protected by freed_range_mutex */
	lsn_t		last_freed_lsn;

	ulint		magic_n;/*!< FIL_SPACE_MAGIC_N */

	/** @return whether doublewrite buffering is needed:
	the device does not support atomic writes, doublewrite is
	enabled, and the doublewrite buffer has been initialised */
	bool use_doublewrite() const
	{
		return !atomic_write_supported && srv_use_doublewrite_buf &&
			buf_dblwr.is_initialised();
	}

	/** Append a file to the chain of files of a space.
+ @param[in] name file name of a file that is not open + @param[in] handle file handle, or OS_FILE_CLOSED + @param[in] size file size in entire database pages + @param[in] is_raw whether this is a raw device + @param[in] atomic_write true if atomic write could be enabled + @param[in] max_pages maximum number of pages in file, + or UINT32_MAX for unlimited + @return file object */ + fil_node_t* add(const char* name, pfs_os_file_t handle, + uint32_t size, bool is_raw, bool atomic_write, + uint32_t max_pages = UINT32_MAX); +#ifdef UNIV_DEBUG + /** Assert that the mini-transaction is compatible with + updating an allocation bitmap page. + @param[in] mtr mini-transaction */ + void modify_check(const mtr_t& mtr) const; +#endif /* UNIV_DEBUG */ + + /** Try to reserve free extents. + @param[in] n_free_now current number of free extents + @param[in] n_to_reserve number of extents to reserve + @return whether the reservation succeeded */ + bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve) + { + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + if (n_reserved_extents + n_to_reserve > n_free_now) { + return false; + } + + n_reserved_extents += n_to_reserve; + return true; + } + + /** Release the reserved free extents. + @param[in] n_reserved number of reserved extents */ + void release_free_extents(uint32_t n_reserved) + { + if (!n_reserved) return; + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + ut_a(n_reserved_extents >= n_reserved); + n_reserved_extents -= n_reserved; + } + + /** Rename a file. + @param[in] name table name after renaming + @param[in] path tablespace file name after renaming + @param[in] log whether to write redo log + @param[in] replace whether to ignore the existence of path + @return error code + @retval DB_SUCCESS on success */ + dberr_t rename(const char* name, const char* path, bool log, + bool replace = false); + + /** Note that the tablespace has been imported. 
+ Initially, purpose=FIL_TYPE_IMPORT so that no redo log is + written while the space ID is being updated in each page. */ + inline void set_imported(); + + /** @return whether the storage device is rotational (HDD, not SSD) */ + inline bool is_rotational() const; + + /** Open each file. Never invoked on .ibd files. + @param create_new_db whether to skip the call to fil_node_t::read_page0() + @return whether all files were opened */ + bool open(bool create_new_db); + /** Close each file. Only invoked on fil_system.temp_space. */ + void close(); + + /** Note that operations on the tablespace must stop or can resume */ + inline void set_stopping(bool stopping); + +private: + MY_ATTRIBUTE((warn_unused_result)) + /** Try to acquire a tablespace reference. + @return the old reference count (if STOPPING is set, it was not acquired) */ + uint32_t acquire_low() + { + uint32_t n= 0; + while (!n_pending.compare_exchange_strong(n, n + 1, + std::memory_order_acquire, + std::memory_order_relaxed) && + !(n & STOPPING)); + return n; + } +public: + MY_ATTRIBUTE((warn_unused_result)) + /** Acquire a tablespace reference. + @return whether a tablespace reference was successfully acquired */ + inline bool acquire_if_not_stopped(); + + MY_ATTRIBUTE((warn_unused_result)) + /** Acquire a tablespace reference for I/O. + @return whether the file is usable */ + bool acquire() + { + uint32_t n= acquire_low(); + if (UNIV_LIKELY(!(n & (STOPPING | CLOSING)))) + return true; + return UNIV_LIKELY(!(n & STOPPING)) && prepare(); + } + + /** Acquire another tablespace reference for I/O. */ + inline void reacquire(); + + /** Release a tablespace reference. 
@return whether this was the last reference */
	bool release()
	{
		/* fetch_sub() returns the value before the decrement, so a
		reference count of 1 means that we released the last one. */
		uint32_t n= n_pending.fetch_sub(1, std::memory_order_release);
		ut_ad(n & PENDING);
		return (n & PENDING) == 1;
	}

	/** Clear the NEEDS_FSYNC flag */
	void clear_flush()
	{ n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release); }

private:
	/** @return pending operations (and flags) */
	uint32_t pending()const { return n_pending.load(std::memory_order_acquire); }
public:
	/** @return whether close() of the file handle has been requested */
	bool is_closing() const { return pending() & CLOSING; }
	/** @return whether the tablespace is going to be dropped */
	bool is_stopping() const { return pending() & STOPPING; }
	/** @return whether the CLOSING flag is set and the reference
	count is zero */
	bool is_ready_to_close() const
	{ return (pending() & (PENDING | CLOSING)) == CLOSING; }
	/** @return whether fsync() or similar is needed */
	bool needs_flush() const { return pending() & NEEDS_FSYNC; }
	/** @return whether fsync() or similar is needed, and the tablespace is
	not being dropped */
	bool needs_flush_not_stopping() const
	{ return (pending() & (NEEDS_FSYNC | STOPPING)) == NEEDS_FSYNC; }

	/** @return number of pending operations (the reference count,
	with all flag bits masked out) */
	uint32_t referenced() const { return pending() & PENDING; }
private:
	MY_ATTRIBUTE((warn_unused_result))
	/** Prepare to close the file handle.
	Sets the CLOSING flag; the returned value reflects the state
	before the flag was set.
	@return number of pending operations, possibly with NEEDS_FSYNC flag */
	uint32_t set_closing()
	{
		return n_pending.fetch_or(CLOSING, std::memory_order_acquire) &
			(PENDING | NEEDS_FSYNC);
	}

public:
	/** Try to close a file to adhere to the innodb_open_files limit.
+ @param print_info whether to diagnose why a file cannot be closed + @return whether a file was closed */ + static bool try_to_close(bool print_info); + + /** Close all tablespace files at shutdown */ + static void close_all(); + + /** @return last_freed_lsn */ + lsn_t get_last_freed_lsn() { return last_freed_lsn; } + /** Update last_freed_lsn */ + void update_last_freed_lsn(lsn_t lsn) + { + std::lock_guard<std::mutex> freed_lock(freed_range_mutex); + last_freed_lsn= lsn; + } + + /** Note that the file will need fsync(). + @return whether this needs to be added to fil_system.unflushed_spaces */ + bool set_needs_flush() + { + uint32_t n= 1; + while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC, + std::memory_order_acquire, + std::memory_order_relaxed)) + { + ut_ad(n & PENDING); + if (n & (NEEDS_FSYNC | STOPPING)) + return false; + } + + return true; + } + + /** Clear all freed ranges for undo tablespace when InnoDB + encounters TRIM redo log record */ + void clear_freed_ranges() + { + std::lock_guard<std::mutex> freed_lock(freed_range_mutex); + freed_ranges.clear(); + } +#endif /* !UNIV_INNOCHECKSUM */ + /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags; + check fsp0types.h to more info about flags. */ + ulint flags; + + /** Determine if full_crc32 is used for a data file + @param[in] flags tablespace flags (FSP_SPACE_FLAGS) + @return whether the full_crc32 algorithm is active */ + static bool full_crc32(ulint flags) { + return flags & FSP_FLAGS_FCRC32_MASK_MARKER; + } + /** @return whether innodb_checksum_algorithm=full_crc32 is active */ + bool full_crc32() const { return full_crc32(flags); } + /** Determine the logical page size. 
+ @param flags tablespace flags (FSP_FLAGS) + @return the logical page size + @retval 0 if the flags are invalid */ + static unsigned logical_size(ulint flags) { + + ulint page_ssize = 0; + + if (full_crc32(flags)) { + page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + } else { + page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + } + + switch (page_ssize) { + case 3: return 4096; + case 4: return 8192; + case 5: + { ut_ad(full_crc32(flags)); return 16384; } + case 0: + { ut_ad(!full_crc32(flags)); return 16384; } + case 6: return 32768; + case 7: return 65536; + default: return 0; + } + } + /** Determine the ROW_FORMAT=COMPRESSED page size. + @param flags tablespace flags (FSP_FLAGS) + @return the ROW_FORMAT=COMPRESSED page size + @retval 0 if ROW_FORMAT=COMPRESSED is not used */ + static unsigned zip_size(ulint flags) { + + if (full_crc32(flags)) { + return 0; + } + + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + return zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize : 0; + } + /** Determine the physical page size. + @param flags tablespace flags (FSP_FLAGS) + @return the physical page size */ + static unsigned physical_size(ulint flags) { + + if (full_crc32(flags)) { + return logical_size(flags); + } + + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + return zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize + : unsigned(srv_page_size); + } + /** @return the ROW_FORMAT=COMPRESSED page size + @retval 0 if ROW_FORMAT=COMPRESSED is not used */ + unsigned zip_size() const { return zip_size(flags); } + /** @return the physical page size */ + unsigned physical_size() const { return physical_size(flags); } + /** Check whether the compression enabled in tablespace. 
+ @param[in] flags tablespace flags */ + static bool is_compressed(ulint flags) { + + if (full_crc32(flags)) { + ulint algo = FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO( + flags); + DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST); + return algo > 0; + } + + return FSP_FLAGS_HAS_PAGE_COMPRESSION(flags); + } + /** @return whether the compression enabled for the tablespace. */ + bool is_compressed() const { return is_compressed(flags); } + + /** Get the compression algorithm for full crc32 format. + @param[in] flags tablespace flags + @return algorithm type of tablespace */ + static ulint get_compression_algo(ulint flags) + { + return full_crc32(flags) + ? FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) + : 0; + } + /** @return the page_compressed algorithm + @retval 0 if not page_compressed */ + ulint get_compression_algo() const { + return fil_space_t::get_compression_algo(flags); + } + /** Determine if the page_compressed page contains an extra byte + for exact compressed stream length + @param[in] flags tablespace flags + @return whether the extra byte is needed */ + static bool full_crc32_page_compressed_len(ulint flags) + { + DBUG_ASSERT(full_crc32(flags)); + switch (get_compression_algo(flags)) { + case PAGE_LZ4_ALGORITHM: + case PAGE_LZO_ALGORITHM: + case PAGE_SNAPPY_ALGORITHM: + return true; + } + return false; + } + + /** Whether the full checksum matches with non full checksum flags. + @param[in] flags flags present + @param[in] expected expected flags + @return true if it is equivalent */ + static bool is_flags_full_crc32_equal(ulint flags, ulint expected) + { + ut_ad(full_crc32(flags)); + ulint fcrc32_psize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + + if (full_crc32(expected)) { + /* The data file may have been created with a + different innodb_compression_algorithm. But + we only support one innodb_page_size for all files. 
*/ + return fcrc32_psize + == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected); + } + + ulint non_fcrc32_psize = FSP_FLAGS_GET_PAGE_SSIZE(expected); + + if (!non_fcrc32_psize) { + if (fcrc32_psize != 5) { + return false; + } + } else if (fcrc32_psize != non_fcrc32_psize) { + return false; + } + + return true; + } + /** Whether old tablespace flags match full_crc32 flags. + @param[in] flags flags present + @param[in] expected expected flags + @return true if it is equivalent */ + static bool is_flags_non_full_crc32_equal(ulint flags, ulint expected) + { + ut_ad(!full_crc32(flags)); + + if (!full_crc32(expected)) { + return false; + } + + ulint non_fcrc32_psize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + ulint fcrc32_psize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE( + expected); + + if (!non_fcrc32_psize) { + if (fcrc32_psize != 5) { + return false; + } + } else if (fcrc32_psize != non_fcrc32_psize) { + return false; + } + + return true; + } + /** Whether both fsp flags are equivalent */ + static bool is_flags_equal(ulint flags, ulint expected) + { + if (!((flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED))) { + return true; + } + + return full_crc32(flags) + ? is_flags_full_crc32_equal(flags, expected) + : is_flags_non_full_crc32_equal(flags, expected); + } + /** Validate the tablespace flags for full crc32 format. + @param[in] flags the content of FSP_SPACE_FLAGS + @return whether the flags are correct in full crc32 format */ + static bool is_fcrc32_valid_flags(ulint flags) + { + ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER); + const ulint page_ssize = physical_size(flags); + if (page_ssize < 3 || page_ssize & 8) { + return false; + } + + flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + + return flags <= PAGE_ALGORITHM_LAST; + } + /** Validate the tablespace flags. + @param[in] flags content of FSP_SPACE_FLAGS + @param[in] is_ibd whether this is an .ibd file + (not system tablespace) + @return whether the flags are correct. 
*/ + static bool is_valid_flags(ulint flags, bool is_ibd) + { + DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", + return false;); + + if (full_crc32(flags)) { + return is_fcrc32_valid_flags(flags); + } + + if (flags == 0) { + return true; + } + + if (flags & ~FSP_FLAGS_MASK) { + return false; + } + + if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) + == FSP_FLAGS_MASK_ATOMIC_BLOBS) { + /* If the "atomic blobs" flag (indicating + ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag + is set, then the "post Antelope" + (ROW_FORMAT!=REDUNDANT) flag must also be set. */ + return false; + } + + /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag + of MySQL 5.6 and MariaDB 10.0, which we ignore. + In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20, + bits 10..14 would be nonzero 0bsssaa where sss is + nonzero PAGE_SSIZE (3, 4, 6, or 7) + and aa is ATOMIC_WRITES (not 0b11). */ + if (FSP_FLAGS_GET_RESERVED(flags) & ~1U) { + return false; + } + + const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { + /* the page_size is not between 4k and 64k; + 16k should be encoded as 0, not 5 */ + return false; + } + + const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + if (zssize == 0) { + /* not ROW_FORMAT=COMPRESSED */ + } else if (zssize > (ssize ? ssize : 5)) { + /* Invalid KEY_BLOCK_SIZE */ + return false; + } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { + /* both these flags should be set for + ROW_FORMAT=COMPRESSED */ + return false; + } + + /* The flags do look valid. But, avoid misinterpreting + buggy MariaDB 10.1 format flags for + PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3} + as valid-looking PAGE_SSIZE if this is known to be + an .ibd file and we are using the default innodb_page_size=16k. 
*/
		return(ssize == 0 || !is_ibd
		       || srv_page_size != UNIV_PAGE_SIZE_ORIG);
	}

#ifndef UNIV_INNOCHECKSUM
	MY_ATTRIBUTE((warn_unused_result))
	/** Create a tablespace in fil_system.
	@param name       tablespace name
	@param id         tablespace identifier
	@param flags      tablespace flags
	@param purpose    tablespace purpose
	@param crypt_data encryption information
	@param mode       encryption mode
	@return pointer to created tablespace, to be filled in with add()
	@retval nullptr on failure (such as when the same tablespace exists) */
	static fil_space_t *create(const char *name, ulint id, ulint flags,
				   fil_type_t purpose, fil_space_crypt_t *crypt_data,
				   fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT);

	MY_ATTRIBUTE((warn_unused_result))
	/** Acquire a tablespace reference.
	@param id      tablespace identifier
	@return tablespace
	@retval nullptr if the tablespace is missing or inaccessible */
	static fil_space_t *get(ulint id);

	/** Add a page to, or remove a page from, the freed ranges,
	while holding freed_range_mutex.
	@param[in]	offset	page number to be added or removed
	@param[in]	add	true to add the page to freed_ranges,
				false to remove it */
	void free_page(uint32_t offset, bool add=true)
	{
		std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
		if (add)
			return freed_ranges.add_value(offset);

		/* Nothing can be removed from an empty set. */
		if (freed_ranges.empty())
			return;

		return freed_ranges.remove_value(offset);
	}

	/** Replace the whole set of freed page ranges
	@param[in]	ranges	the new set of freed ranges */
	void add_free_ranges(range_set ranges)
	{
		std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
		freed_ranges= std::move(ranges);
	}

	/** Add one range of freed pages
	@param[in]	range	the range to be added to freed_ranges */
	void add_free_range(const range_t range)
	{
		std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
		freed_ranges.add_range(range);
	}

	/** Set the tablespace size in pages */
	void set_sizes(uint32_t s)
	{
		ut_ad(id ?
!size : (size >= s)); + size= s; committed_size= s; + } + + /** Update committed_size in mtr_t::commit() */ + void set_committed_size() + { + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + committed_size= size; + } + + /** @return the last persisted page number */ + uint32_t last_page_number() const { return committed_size - 1; } + + /** @return the size in pages (0 if unreadable) */ + inline uint32_t get_size(); + + /** Read or write data. + @param type I/O context + @param offset offset in bytes + @param len number of bytes + @param buf the data to be read or written + @param bpage buffer block (for type.is_async() completion callback) + @return status and file descriptor */ + fil_io_t io(const IORequest &type, os_offset_t offset, size_t len, + void *buf, buf_page_t *bpage= nullptr); + /** Flush pending writes from the file system cache to the file. */ + template<bool have_reference> inline void flush(); + /** Flush pending writes from the file system cache to the file. */ + void flush_low(); + + /** Read the first page of a data file. + @return whether the page was found valid */ + bool read_page0(); + + /** Determine the next tablespace for encryption key rotation. 
+ @param space current tablespace (nullptr to start from the beginning) + @param recheck whether the removal condition needs to be rechecked after + encryption parameters were changed + @param encrypt expected state of innodb_encrypt_tables + @return the next tablespace + @retval nullptr upon reaching the end of the iteration */ + static inline fil_space_t *next(fil_space_t *space, bool recheck, + bool encrypt); + +private: + /** @return whether the file is usable for io() */ + ATTRIBUTE_COLD bool prepare(bool have_mutex= false); +#endif /*!UNIV_INNOCHECKSUM */ +}; + +#ifndef UNIV_INNOCHECKSUM +/** Value of fil_space_t::magic_n */ +#define FIL_SPACE_MAGIC_N 89472 + +/** File node of a tablespace or the log data space */ +struct fil_node_t final +{ + /** tablespace containing this file */ + fil_space_t* space; + /** file name; protected by fil_system.mutex and log_sys.mutex. */ + char* name; + /** file handle (valid if is_open) */ + pfs_os_file_t handle; + /** whether the file actually is a raw device or disk partition */ + bool is_raw_disk; + /** whether the file is on non-rotational media (SSD) */ + bool on_ssd; + /** size of the file in database pages (0 if not known yet); + the possible last incomplete megabyte may be ignored + if space->id == 0 */ + uint32_t size; + /** initial size of the file in database pages; + FIL_IBD_FILE_INITIAL_SIZE by default */ + uint32_t init_size; + /** maximum size of the file in database pages (0 if unlimited) */ + uint32_t max_size; + /** whether the file is currently being extended */ + Atomic_relaxed<bool> being_extended; + /** link to other files in this tablespace */ + UT_LIST_NODE_T(fil_node_t) chain; + + /** whether this file could use atomic write (data file) */ + bool atomic_write; + + /** Filesystem block size */ + ulint block_size; + + /** FIL_NODE_MAGIC_N */ + ulint magic_n; + + /** @return whether this file is open */ + bool is_open() const + { + return(handle != OS_FILE_CLOSED); + } + + /** Read the first page of a 
data file. + @return whether the page was found valid */ + bool read_page0(); + + /** Determine some file metadata when creating or reading the file. + @param file the file that is being created, or OS_FILE_CLOSED */ + void find_metadata(os_file_t file = OS_FILE_CLOSED +#ifndef _WIN32 + , struct stat* statbuf = NULL +#endif + ); + + /** Close the file handle. */ + void close(); + /** Same as close() but returns file handle instead of closing it. */ + pfs_os_file_t detach() MY_ATTRIBUTE((warn_unused_result)); + /** Prepare to free a file from fil_system. + @param detach_handle whether to detach instead of closing a handle + @return detached handle or OS_FILE_CLOSED */ + inline pfs_os_file_t close_to_free(bool detach_handle= false); + + /** Update the data structures on write completion */ + inline void complete_write(); + +private: + /** Does stuff common for close() and detach() */ + void prepare_to_close_or_detach(); +}; + +/** Value of fil_node_t::magic_n */ +#define FIL_NODE_MAGIC_N 89389 + +inline void fil_space_t::set_imported() +{ + ut_ad(purpose == FIL_TYPE_IMPORT); + purpose= FIL_TYPE_TABLESPACE; + UT_LIST_GET_FIRST(chain)->find_metadata(); +} + +inline bool fil_space_t::is_rotational() const +{ + for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (!node->on_ssd) + return true; + return false; +} + +/** Common InnoDB file extensions */ +enum ib_extention { + NO_EXT = 0, + IBD = 1, + ISL = 2, + CFG = 3 +}; +extern const char* dot_ext[]; +#define DOT_IBD dot_ext[IBD] +#define DOT_ISL dot_ext[ISL] +#define DOT_CFG dot_ext[CFG] + +/** When mysqld is run, the default directory "." 
is the mysqld datadir, +but in the MySQL Embedded Server Library and mysqlbackup it is not the default +directory, and we must set the base file path explicitly */ +extern const char* fil_path_to_mysql_datadir; +#else +# include "univ.i" +#endif /* !UNIV_INNOCHECKSUM */ + +/** Initial size of a single-table tablespace in pages */ +#define FIL_IBD_FILE_INITIAL_SIZE 4U + +/** 'null' (undefined) page offset in the context of file spaces */ +#define FIL_NULL ULINT32_UNDEFINED + + +#define FIL_ADDR_PAGE 0U /* first in address is the page offset */ +#define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/ +#define FIL_ADDR_SIZE 6U /* address size is 6 bytes */ + +/** File space address */ +struct fil_addr_t { + /** page number within a tablespace */ + uint32_t page; + /** byte offset within the page */ + uint16_t boffset; +}; + +/** The byte offsets on a file page for various variables @{ */ +#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the + page belongs to (== 0) but in later + versions the 'new' checksum of the + page */ +#define FIL_PAGE_OFFSET 4U /*!< page offset inside space */ +#define FIL_PAGE_PREV 8U /*!< if there is a 'natural' + predecessor of the page, its + offset. Otherwise FIL_NULL. + This field is not set on BLOB + pages, which are stored as a + singly-linked list. See also + FIL_PAGE_NEXT. */ +#define FIL_PAGE_NEXT 12U /*!< if there is a 'natural' successor + of the page, its offset. + Otherwise FIL_NULL. + B-tree index pages + (FIL_PAGE_TYPE contains FIL_PAGE_INDEX) + on the same PAGE_LEVEL are maintained + as a doubly linked list via + FIL_PAGE_PREV and FIL_PAGE_NEXT + in the collation order of the + smallest user record on each page. */ +#define FIL_PAGE_LSN 16U /*!< lsn of the end of the newest + modification log record to the page */ +#define FIL_PAGE_TYPE 24U /*!< file page type: FIL_PAGE_INDEX,..., + 2 bytes. 
					The contents of this field can only
					be trusted in the following case:
					if the page is an uncompressed
					B-tree index page, then it is
					guaranteed that the value is
					FIL_PAGE_INDEX.
					The opposite does not hold.

					In tablespaces created by
					MySQL/InnoDB 5.1.7 or later, the
					contents of this field are valid
					for all uncompressed pages. */

/** For the first page in a system tablespace data file (ibdata*, not *.ibd):
the file has been flushed to disk at least up to this lsn.
For other pages: 32-bit key version used to encrypt the page + 32-bit checksum
or 64 bits of zero if no encryption */
#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U

/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
#define	FIL_RTREE_SPLIT_SEQ_NUM	FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION

/** Start of the page_compressed content */
#define FIL_PAGE_COMP_ALGO	FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION

/** starting from 4.1.x this contains the space id of the page */
#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID  34U

#define FIL_PAGE_SPACE_ID  FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID

#define FIL_PAGE_DATA		38U	/*!< start of the data on the page */

/** 32-bit key version used to encrypt the page in full_crc32 format.
For non-encrypted page, it contains 0. */
#define FIL_PAGE_FCRC32_KEY_VERSION	0

/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */
/** Number of bytes used to store actual payload data size on
page_compressed pages when not using full_crc32. */
#define FIL_PAGE_COMP_SIZE		0

/** Number of bytes for FIL_PAGE_COMP_SIZE */
#define FIL_PAGE_COMP_METADATA_LEN		2

/** Number of bytes used to store actual compression method
for encrypted tables when not using full_crc32.
*/ +#define FIL_PAGE_ENCRYPT_COMP_ALGO 2 + +/** Extra header size for encrypted page_compressed pages when +not using full_crc32 */ +#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN 4 +/* @} */ + +/** File page trailer @{ */ +#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used + to store the page checksum, the + last 4 bytes should be identical + to the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */ + +/** Store the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_FCRC32_END_LSN 8 + +/** Store crc32 checksum at the end of the page */ +#define FIL_PAGE_FCRC32_CHECKSUM 4 +/* @} */ + +/** File page types (values of FIL_PAGE_TYPE) @{ */ +/** page_compressed, encrypted=YES (not used for full_crc32) */ +constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED= 37401; +/** page_compressed (not used for full_crc32) */ +constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED= 34354; +/** B-tree index page */ +constexpr uint16_t FIL_PAGE_INDEX= 17855; +/** R-tree index page (SPATIAL INDEX) */ +constexpr uint16_t FIL_PAGE_RTREE= 17854; +/** Undo log page */ +constexpr uint16_t FIL_PAGE_UNDO_LOG= 2; +/** Index node (of file-in-file metadata) */ +constexpr uint16_t FIL_PAGE_INODE= 3; +/** Insert buffer free list */ +constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4; +/** Freshly allocated page */ +constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0; +/** Change buffer bitmap (pages n*innodb_page_size+1) */ +constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5; +/** System page */ +constexpr uint16_t FIL_PAGE_TYPE_SYS= 6; +/** Transaction system data */ +constexpr uint16_t FIL_PAGE_TYPE_TRX_SYS= 7; +/** Tablespace header (page 0) */ +constexpr uint16_t FIL_PAGE_TYPE_FSP_HDR= 8; +/** Extent descriptor page (pages n*innodb_page_size, except 0) */ +constexpr uint16_t FIL_PAGE_TYPE_XDES= 9; +/** Uncompressed BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_BLOB= 10; +/** First ROW_FORMAT=COMPRESSED BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_ZBLOB= 
11; +/** Subsequent ROW_FORMAT=COMPRESSED BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_ZBLOB2= 12; +/** In old tablespaces, garbage in FIL_PAGE_TYPE is replaced with this +value when flushing pages. */ +constexpr uint16_t FIL_PAGE_TYPE_UNKNOWN= 13; + +/* File page types introduced in MySQL 5.7, not supported in MariaDB */ +//constexpr uint16_t FIL_PAGE_COMPRESSED = 14; +//constexpr uint16_t FIL_PAGE_ENCRYPTED = 15; +//constexpr uint16_t FIL_PAGE_COMPRESSED_AND_ENCRYPTED = 16; +//constexpr FIL_PAGE_ENCRYPTED_RTREE = 17; +/** Clustered index root page after instant ADD COLUMN */ +constexpr uint16_t FIL_PAGE_TYPE_INSTANT= 18; + +/** Used by i_s.cc to index into the text description. +Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */ +constexpr uint16_t FIL_PAGE_TYPE_LAST= FIL_PAGE_TYPE_UNKNOWN; + +/** Set in FIL_PAGE_TYPE for full_crc32 pages in page_compressed format. +If the flag is set, then the following holds for the remaining bits +of FIL_PAGE_TYPE: +Bits 0..7 will contain the compressed page size in bytes. +Bits 8..14 are reserved and must be 0. */ +constexpr uint16_t FIL_PAGE_COMPRESS_FCRC32_MARKER= 15; +/* @} */ + +/** @return whether the page type is B-tree or R-tree index */ +inline bool fil_page_type_is_index(uint16_t page_type) +{ + switch (page_type) { + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + return(true); + } + return(false); +} + +/** Check whether the page is index page (either regular Btree index or Rtree +index */ +#define fil_page_index_page_check(page) \ + fil_page_type_is_index(fil_page_get_type(page)) + +/** Get the file page type. +@param[in] page file page +@return page type */ +inline uint16_t fil_page_get_type(const byte *page) +{ + return mach_read_from_2(my_assume_aligned<2>(page + FIL_PAGE_TYPE)); +} + +#ifndef UNIV_INNOCHECKSUM + +/** Number of pending tablespace flushes */ +extern Atomic_counter<ulint> fil_n_pending_tablespace_flushes; + +/** Look up a tablespace. 
+The caller should hold an InnoDB table lock or a MDL that prevents +the tablespace from being dropped during the operation, +or the caller should be in single-threaded crash recovery mode +(no user connections that could drop tablespaces). +Normally, fil_space_t::get() should be used instead. +@param[in] id tablespace ID +@return tablespace, or NULL if not found */ +fil_space_t* +fil_space_get( + ulint id) + MY_ATTRIBUTE((warn_unused_result)); + +/** The tablespace memory cache; also the totality of logs (the log +data space) is stored here; below we talk about tablespaces */ +struct fil_system_t { + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + fil_system_t(): m_initialised(false) + { + UT_LIST_INIT(space_list, &fil_space_t::space_list); + UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces); + } + + bool is_initialised() const { return m_initialised; } + + /** + Create the file system interface at database start. + + @param[in] hash_size hash table size + */ + void create(ulint hash_size); + + /** Close the file system interface at shutdown */ + void close(); + +private: + bool m_initialised; +#ifdef UNIV_LINUX + /** available block devices that reside on non-rotational storage */ + std::vector<dev_t> ssd; +public: + /** @return whether a file system device is on non-rotational storage */ + bool is_ssd(dev_t dev) const + { + /* Linux seems to allow up to 15 partitions per block device. + If the detected ssd carries "partition number 0" (it is the whole device), + compare the candidate file system number without the partition number. */ + for (const auto s : ssd) + if (dev == s || (dev & ~15U) == s) + return true; + return false; + } +#endif +public: + /** Detach a tablespace from the cache and close the files. 
+ @param space tablespace + @param detach_handle whether to detach or close handles + @return detached handles or empty vector */ + std::vector<pfs_os_file_t> detach(fil_space_t *space, + bool detach_handle= false); + + ib_mutex_t mutex; /*!< The mutex protecting the cache */ + fil_space_t* sys_space; /*!< The innodb_system tablespace */ + fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ + /** Map of fil_space_t::id to fil_space_t* */ + hash_table_t spaces; + /** tablespaces for which fil_space_t::needs_flush() holds */ + sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces; + /** number of currently open files; protected by mutex */ + ulint n_open; + ulint max_assigned_id;/*!< maximum space id in the existing + tables, or assigned during the time + mysqld has been up; at an InnoDB + startup we scan the data dictionary + and set here the maximum of the + space id's of the tables there */ + /** nonzero if fil_node_open_file_low() should avoid moving the tablespace + to the end of space_list, for FIFO policy of try_to_close() */ + ulint freeze_space_list; + UT_LIST_BASE_NODE_T(fil_space_t) space_list; + /*!< list of all file spaces */ + UT_LIST_BASE_NODE_T(fil_space_t) named_spaces; + /*!< list of all file spaces + for which a FILE_MODIFY + record has been written since + the latest redo log checkpoint. + Protected only by log_sys.mutex. */ + + /** List of all file spaces need key rotation */ + ilist<fil_space_t, rotation_list_tag_t> default_encrypt_tables; + + bool space_id_reuse_warned; + /*!< whether fil_space_t::create() + has issued a warning about + potential space_id reuse */ + + /** Return the next tablespace from default_encrypt_tables list. 
+ @param space previous tablespace (nullptr to start from the start) + @param recheck whether the removal condition needs to be rechecked after + the encryption parameters were changed + @param encrypt expected state of innodb_encrypt_tables + @return the next tablespace to process (n_pending_ops incremented) + @retval fil_system.temp_space if there is no work to do + @retval nullptr upon reaching the end of the iteration */ + inline fil_space_t* default_encrypt_next(fil_space_t *space, bool recheck, + bool encrypt); + + /** Extend all open data files to the recovered size */ + ATTRIBUTE_COLD void extend_to_recv_size(); +}; + +/** The tablespace memory cache. */ +extern fil_system_t fil_system; + +inline void fil_space_t::reacquire() +{ + ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed); + ut_d(if (mutex_own(&fil_system.mutex)) return); + ut_ad(n & PENDING); + ut_ad(UT_LIST_GET_FIRST(chain)->is_open()); +} + +/** Note that operations on the tablespace must stop or can resume */ +inline void fil_space_t::set_stopping(bool stopping) +{ + ut_ad(mutex_own(&fil_system.mutex)); + ut_d(auto n=) n_pending.fetch_xor(STOPPING, std::memory_order_relaxed); + ut_ad(!(n & STOPPING) == stopping); +} + +/** Flush pending writes from the file system cache to the file. 
*/ +template<bool have_reference> inline void fil_space_t::flush() +{ + ut_ad(!mutex_own(&fil_system.mutex)); + ut_ad(!have_reference || (pending() & PENDING)); + ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT); + if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) + { + ut_ad(!is_in_unflushed_spaces); + ut_ad(!needs_flush()); + } + else if (have_reference) + flush_low(); + else if (!(acquire_low() & STOPPING)) + { + flush_low(); + release(); + } +} + +/** @return the size in pages (0 if unreadable) */ +inline uint32_t fil_space_t::get_size() +{ + if (!size) + { + mutex_enter(&fil_system.mutex); + read_page0(); + mutex_exit(&fil_system.mutex); + } + return size; +} + +#include "fil0crypt.h" + +/*******************************************************************//** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. +@return true if assigned, false if not */ +bool +fil_assign_new_space_id( +/*====================*/ + ulint* space_id); /*!< in/out: space id */ + +/** Frees a space object from the tablespace memory cache. +Closes the files in the chain but does not delete them. +There must not be any pending i/o's or flushes on the files. +@param[in] id tablespace identifier +@param[in] x_latched whether the caller holds X-mode space->latch +@return true if success */ +bool +fil_space_free( + ulint id, + bool x_latched); + +/** Set the recovered size of a tablespace in pages. +@param id tablespace ID +@param size recovered size in pages +@param flags tablespace flags */ +void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, + uint32_t flags); + +/*******************************************************************//** +Sets the max tablespace id counter if the given number is bigger than the +previous value. 
 */ +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id);/*!< in: maximum known id */ + +/** Write the flushed LSN to the page header of the first page in the +system tablespace. +@param[in] lsn flushed LSN +@return DB_SUCCESS or error number */ +dberr_t +fil_write_flushed_lsn( + lsn_t lsn) +MY_ATTRIBUTE((warn_unused_result)); + +/** Delete a tablespace and associated .ibd file. +@param[in] id tablespace identifier +@param[in] if_exists whether to ignore missing tablespace +@param[out] detached_handles return detached handles here +@return DB_SUCCESS or error */ +dberr_t +fil_delete_tablespace(ulint id, bool if_exists= false, + std::vector<pfs_os_file_t> *detached_handles= nullptr); + +/** Prepare to truncate an undo tablespace. +@param[in] space_id undo tablespace id +@return the tablespace +@retval NULL if the tablespace does not exist */ +fil_space_t* fil_truncate_prepare(ulint space_id); + +/** Close a single-table tablespace on failed IMPORT TABLESPACE. +The tablespace must be cached in the memory cache. +Free all pages used by the tablespace. */ +void fil_close_tablespace(ulint id); + +/*******************************************************************//** +Allocates and builds a file name from a path, a table or tablespace name +and a suffix. The string must be freed by caller with ut_free(). +@param[in] path NULL or the directory path or the full path and filename. +@param[in] name NULL if path is full, or Table/Tablespace name +@param[in] suffix NULL or the file extension to use. +@return own: file name */ +char* +fil_make_filepath( + const char* path, + const char* name, + ib_extention suffix, + bool strip_name); + +/** Create a tablespace file. +@param[in] space_id Tablespace ID +@param[in] name Tablespace name in dbname/tablename format. +@param[in] path Path and filename of the datafile to create. 
+@param[in] flags Tablespace flags +@param[in] size Initial size of the tablespace file in pages, +must be >= FIL_IBD_FILE_INITIAL_SIZE +@param[in] mode MariaDB encryption mode +@param[in] key_id MariaDB encryption key_id +@param[out] err DB_SUCCESS or error code +@return the created tablespace +@retval NULL on error */ +fil_space_t* +fil_ibd_create( + ulint space_id, + const char* name, + const char* path, + ulint flags, + uint32_t size, + fil_encryption_t mode, + uint32_t key_id, + dberr_t* err) + MY_ATTRIBUTE((nonnull(2,8), warn_unused_result)); + +/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations. +(Typically when upgrading from MariaDB 10.1.0..10.1.20.) +@param[in,out] space tablespace +@param[in] flags desired tablespace flags */ +void fsp_flags_try_adjust(fil_space_t* space, ulint flags); + +/********************************************************************//** +Tries to open a single-table tablespace and optionally checks the space id is +right in it. If does not succeed, prints an error message to the .err log. This +function is used to open a tablespace when we start up mysqld, and also in +IMPORT TABLESPACE. +NOTE that we assume this operation is used either at the database startup +or under the protection of the dictionary mutex, so that two users cannot +race here. This operation does not leave the file associated with the +tablespace open, but closes it after we have looked at the space id in it. + +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially false, but if +a remote tablespace is found it will be changed to true. 
+ +If the fix_dict boolean is set, then it is safe to use an internal SQL +statement to update the dictionary tables if they are incorrect. + +@param[in] validate true if we should validate the tablespace +@param[in] fix_dict true if the dictionary is available to be fixed +@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY +@param[in] id tablespace ID +@param[in] flags expected FSP_SPACE_FLAGS +@param[in] tablename table name +If file-per-table, it is the table name in the databasename/tablename format +@param[in] path_in expected filepath, usually read from dictionary +@param[out] err DB_SUCCESS or error code +@return tablespace +@retval NULL if the tablespace could not be opened */ +fil_space_t* +fil_ibd_open( + bool validate, + bool fix_dict, + fil_type_t purpose, + ulint id, + ulint flags, + const table_name_t& tablename, + const char* path_in, + dberr_t* err = NULL) + MY_ATTRIBUTE((warn_unused_result)); + +enum fil_load_status { + /** The tablespace file(s) were found and valid. */ + FIL_LOAD_OK, + /** The name no longer matches space_id */ + FIL_LOAD_ID_CHANGED, + /** The file(s) were not found */ + FIL_LOAD_NOT_FOUND, + /** The file(s) were not valid */ + FIL_LOAD_INVALID +}; + +/** Open a single-file tablespace and add it to the InnoDB data structures. +@param[in] space_id tablespace ID +@param[in] filename path/to/databasename/tablename.ibd +@param[out] space the tablespace, or NULL on error +@return status of the operation */ +enum fil_load_status +fil_ibd_load( + ulint space_id, + const char* filename, + fil_space_t*& space) + MY_ATTRIBUTE((warn_unused_result)); + + +/** Determine if a matching tablespace exists in the InnoDB tablespace +memory cache. Note that if we have not done a crash recovery at the database +startup, there may be many tablespaces which are not yet in the memory cache. +@param[in] id Tablespace ID +@param[in] name Tablespace name used in fil_space_t::create(). 
+@param[in] table_flags table flags +@return the tablespace +@retval NULL if no matching tablespace exists in the memory cache */ +fil_space_t* +fil_space_for_table_exists_in_mem( + ulint id, + const char* name, + ulint table_flags); + +/** Try to extend a tablespace if it is smaller than the specified size. +@param[in,out] space tablespace +@param[in] size desired size in pages +@return whether the tablespace is at least as big as requested */ +bool fil_space_extend(fil_space_t *space, uint32_t size); + +/** Flush to disk the writes in file spaces +possibly cached by the OS. */ +void fil_flush_file_spaces(); +/******************************************************************//** +Checks the consistency of the tablespace cache. +@return true if ok */ +bool fil_validate(); +/*********************************************************************//** +Sets the file page type. */ +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type); /*!< in: type */ + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +void +fil_delete_file( +/*============*/ + const char* path); /*!< in: filepath of the ibd tablespace */ + +/********************************************************************//** +Looks for a pre-existing fil_space_t with the given tablespace ID +and, if found, returns the name and filepath in newly allocated buffers that the caller must free. +@param[in] space_id The tablespace ID to search for. +@param[out] name Name of the tablespace found. +@param[out] filepath The filepath of the first datafile for the tablespace found. +@return true if tablespace is found, false if not. */ +bool +fil_space_read_name_and_filepath( + ulint space_id, + char** name, + char** filepath); + +/** Convert a file name to a tablespace name. 
+@param[in] filename directory/databasename/tablename.ibd +@return database/tablename string, to be freed with ut_free() */ +char* +fil_path_to_space_name( + const char* filename); + +/** Acquire the fil_system mutex. */ +#define fil_system_enter() mutex_enter(&fil_system.mutex) +/** Release the fil_system mutex. */ +#define fil_system_exit() mutex_exit(&fil_system.mutex) + +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. */ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ + +/** Note that a non-predefined persistent tablespace has been modified +by redo log. +@param[in,out] space tablespace */ +void +fil_names_dirty( + fil_space_t* space); + +/** Write FILE_MODIFY records when a non-predefined persistent +tablespace was modified for the first time since the latest +fil_names_clear(). +@param[in,out] space tablespace */ +void fil_names_dirty_and_write(fil_space_t* space); + +/** Write FILE_MODIFY records if a persistent tablespace was modified +for the first time since the latest fil_names_clear(). +@param[in,out] space tablespace +@param[in,out] mtr mini-transaction +@return whether any FILE_MODIFY record was written */ +inline bool fil_names_write_if_was_clean(fil_space_t* space) +{ + mysql_mutex_assert_owner(&log_sys.mutex); + + if (space == NULL) { + return(false); + } + + const bool was_clean = space->max_lsn == 0; + ut_ad(space->max_lsn <= log_sys.get_lsn()); + space->max_lsn = log_sys.get_lsn(); + + if (was_clean) { + fil_names_dirty_and_write(space); + } + + return(was_clean); +} + +/** On a log checkpoint, reset fil_names_dirty_and_write() flags +and write out FILE_MODIFY and FILE_CHECKPOINT if needed. 
+@param[in] lsn checkpoint LSN +@param[in] do_write whether to always write FILE_CHECKPOINT +@return whether anything was written to the redo log +@retval false if no flags were set and nothing written +@retval true if anything was written to the redo log */ +bool +fil_names_clear( + lsn_t lsn, + bool do_write); + +#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH +void test_make_filepath(); +#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */ + +/** Determine the block size of the data file. +@param[in] space tablespace +@param[in] offset page number +@return block size */ +UNIV_INTERN +ulint +fil_space_get_block_size(const fil_space_t* space, unsigned offset); + +#include "fil0fil.ic" +#endif /* UNIV_INNOCHECKSUM */ + +#endif /* fil0fil_h */ diff --git a/storage/innobase/include/fil0fil.ic b/storage/innobase/include/fil0fil.ic new file mode 100644 index 00000000..fd5f5bc1 --- /dev/null +++ b/storage/innobase/include/fil0fil.ic @@ -0,0 +1,144 @@ +/***************************************************************************** + +Copyright (c) 2015, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0fil.ic +The low-level file system support functions + +Created 31/03/2015 Jan Lindström +*******************************************************/ + +#ifndef fil0fil_ic +#define fil0fil_ic + +/*******************************************************************//** +Return page type name */ +UNIV_INLINE +const char* +fil_get_page_type_name( +/*===================*/ + ulint page_type) /*!< in: FIL_PAGE_TYPE */ +{ + switch(page_type) { + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + return "PAGE_COMPRESSED_ENRYPTED"; + case FIL_PAGE_PAGE_COMPRESSED: + return "PAGE_COMPRESSED"; + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_INDEX: + return "INDEX"; + case FIL_PAGE_RTREE: + return "RTREE"; + case FIL_PAGE_UNDO_LOG: + return "UNDO LOG"; + case FIL_PAGE_INODE: + return "INODE"; + case FIL_PAGE_IBUF_FREE_LIST: + return "IBUF_FREE_LIST"; + case FIL_PAGE_TYPE_ALLOCATED: + return "ALLOCATED"; + case FIL_PAGE_IBUF_BITMAP: + return "IBUF_BITMAP"; + case FIL_PAGE_TYPE_SYS: + return "SYS"; + case FIL_PAGE_TYPE_TRX_SYS: + return "TRX_SYS"; + case FIL_PAGE_TYPE_FSP_HDR: + return "FSP_HDR"; + case FIL_PAGE_TYPE_XDES: + return "XDES"; + case FIL_PAGE_TYPE_BLOB: + return "BLOB"; + case FIL_PAGE_TYPE_ZBLOB: + return "ZBLOB"; + case FIL_PAGE_TYPE_ZBLOB2: + return "ZBLOB2"; + case FIL_PAGE_TYPE_UNKNOWN: + return "OLD UNKNOWN PAGE TYPE"; + default: + return "PAGE TYPE CORRUPTED"; + } +} + +#ifdef UNIV_DEBUG +/** Validate page type. 
+@param[in] space Tablespace object +@param[in] page page to validate +@return true if valid, false if not */ +UNIV_INLINE +bool +fil_page_type_validate( + fil_space_t* space, + const byte* page) +{ + const uint16_t page_type = fil_page_get_type(page); + + if ((page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + && space->full_crc32() + && space->is_compressed()) { + return true; + } + + /* Validate page type */ + if (!((page_type == FIL_PAGE_PAGE_COMPRESSED || + page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED || + page_type == FIL_PAGE_INDEX || + page_type == FIL_PAGE_TYPE_INSTANT || + page_type == FIL_PAGE_RTREE || + page_type == FIL_PAGE_UNDO_LOG || + page_type == FIL_PAGE_INODE || + page_type == FIL_PAGE_IBUF_FREE_LIST || + page_type == FIL_PAGE_TYPE_ALLOCATED || + page_type == FIL_PAGE_IBUF_BITMAP || + page_type == FIL_PAGE_TYPE_SYS || + page_type == FIL_PAGE_TYPE_TRX_SYS || + page_type == FIL_PAGE_TYPE_FSP_HDR || + page_type == FIL_PAGE_TYPE_XDES || + page_type == FIL_PAGE_TYPE_BLOB || + page_type == FIL_PAGE_TYPE_ZBLOB || + page_type == FIL_PAGE_TYPE_ZBLOB2 || + page_type == FIL_PAGE_TYPE_UNKNOWN))) { + + ulint space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + ulint offset = mach_read_from_4(page + FIL_PAGE_OFFSET); + + ulint key_version = mach_read_from_4( + page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + if (space && space->full_crc32()) { + key_version = mach_read_from_4( + page + FIL_PAGE_FCRC32_KEY_VERSION); + } + + /* Dump out the page info */ + ib::fatal() << "Page " << space_id << ":" << offset + << " name " << (space ? 
space->name : "???") + << " page_type " << page_type + << " key_version " << key_version + << " lsn " << mach_read_from_8(page + FIL_PAGE_LSN) + << " compressed_len " << mach_read_from_2(page + FIL_PAGE_DATA); + return false; + } + + return true; +} +#endif /* UNIV_DEBUG */ + +#endif /* fil0fil_ic */ diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h new file mode 100644 index 00000000..c6ba24fa --- /dev/null +++ b/storage/innobase/include/fil0pagecompress.h @@ -0,0 +1,60 @@ +/***************************************************************************** + +Copyright (C) 2013, 2019 MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/** Compress a page_compressed page before writing to a data file. 
+@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +ulint fil_page_compress( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Decompress a page that may be subject to page_compressed compression. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress( + byte* tmp_buf, + byte* buf, + ulint flags) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h new file mode 100644 index 00000000..7db85e87 --- /dev/null +++ b/storage/innobase/include/fsp0file.h @@ -0,0 +1,576 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0file.h +Tablespace data file implementation. + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#ifndef fsp0file_h +#define fsp0file_h + +#include "mem0mem.h" +#include "os0file.h" +#include "fil0fil.h" + +/** Types of raw partitions in innodb_data_file_path */ +enum device_t { + SRV_NOT_RAW = 0, /*!< Not a raw partition */ + SRV_NEW_RAW, /*!< A 'newraw' partition, only to be + initialized */ + SRV_OLD_RAW /*!< An initialized raw partition */ +}; + +/** Data file control information. */ +class Datafile { + + friend class Tablespace; + friend class SysTablespace; + +public: + + Datafile() + : + m_name(), + m_filepath(), + m_filename(), + m_handle(), + m_open_flags(OS_FILE_OPEN), + m_size(), + m_order(), + m_type(SRV_NOT_RAW), + m_space_id(ULINT_UNDEFINED), + m_flags(), + m_exists(), + m_is_valid(), + m_first_page(), + m_last_os_error(), + m_file_info() + { + /* No op */ + } + + Datafile(const char* name, ulint flags, uint32_t size, ulint order) + : + m_name(mem_strdup(name)), + m_filepath(), + m_filename(), + m_handle(), + m_open_flags(OS_FILE_OPEN), + m_size(size), + m_order(order), + m_type(SRV_NOT_RAW), + m_space_id(ULINT_UNDEFINED), + m_flags(flags), + m_exists(), + m_is_valid(), + m_first_page(), + m_last_os_error(), + m_file_info() + { + ut_ad(m_name != NULL); + /* No op */ + } + + Datafile(const Datafile& file) + : + m_handle(file.m_handle), + m_open_flags(file.m_open_flags), + m_size(file.m_size), + m_order(file.m_order), + m_type(file.m_type), + m_space_id(file.m_space_id), + m_flags(file.m_flags), + m_exists(file.m_exists), + 
m_is_valid(file.m_is_valid), + m_first_page(), + m_last_os_error(), + m_file_info() + { + m_name = mem_strdup(file.m_name); + ut_ad(m_name != NULL); + + if (file.m_filepath != NULL) { + m_filepath = mem_strdup(file.m_filepath); + ut_a(m_filepath != NULL); + set_filename(); + } else { + m_filepath = NULL; + m_filename = NULL; + } + } + + virtual ~Datafile() + { + shutdown(); + } + + Datafile& operator=(const Datafile& file) + { + ut_a(this != &file); + + ut_ad(m_name == NULL); + m_name = mem_strdup(file.m_name); + ut_a(m_name != NULL); + + m_size = file.m_size; + m_order = file.m_order; + m_type = file.m_type; + + ut_a(m_handle == OS_FILE_CLOSED); + m_handle = file.m_handle; + + m_exists = file.m_exists; + m_is_valid = file.m_is_valid; + m_open_flags = file.m_open_flags; + m_space_id = file.m_space_id; + m_flags = file.m_flags; + m_last_os_error = 0; + + if (m_filepath != NULL) { + ut_free(m_filepath); + m_filepath = NULL; + m_filename = NULL; + } + + if (file.m_filepath != NULL) { + m_filepath = mem_strdup(file.m_filepath); + ut_a(m_filepath != NULL); + set_filename(); + } + + /* Do not make a copy of the first page, + it should be reread if needed */ + m_first_page = NULL; + + return(*this); + } + + /** Initialize the name and flags of this datafile. + @param[in] name tablespace name, will be copied + @param[in] flags tablespace flags */ + void init(const char* name, ulint flags); + + /** Release the resources. */ + virtual void shutdown(); + + /** Open a data file in read-only mode to check if it exists + so that it can be validated. + @param[in] strict whether to issue error messages + @return DB_SUCCESS or error code */ + virtual dberr_t open_read_only(bool strict); + + /** Open a data file in read-write mode during start-up so that + doublewrite pages can be restored and then it can be validated. + @param[in] read_only_mode if true, then readonly mode checks + are enforced. 
+ @return DB_SUCCESS or error code */ + virtual dberr_t open_read_write(bool read_only_mode) + MY_ATTRIBUTE((warn_unused_result)); + + /** Initialize OS specific file info. */ + void init_file_info(); + + /** Close a data file. + @return DB_SUCCESS or error code */ + dberr_t close(); + + /** Make a full filepath from a directory path and a filename. + Prepend the dirpath to filename using the extension given. + If dirpath is NULL, prepend the default datadir to filepath. + Store the result in m_filepath. + @param[in] dirpath directory path + @param[in] filename filename or filepath + @param[in] ext filename extension */ + void make_filepath( + const char* dirpath, + const char* filename, + ib_extention ext); + + /** Set the filepath by duplicating the filepath sent in */ + void set_filepath(const char* filepath); + + /** Allocate and set the datafile or tablespace name in m_name. + If a name is provided, use it; else extract a file-per-table + tablespace name from m_filepath. The value of m_name + will be freed in the destructor. + @param[in] name Tablespace Name if known, NULL if not */ + void set_name(const char* name); + + /** Validates the datafile and checks that it conforms with + the expected space ID and flags. The file should exist and be + successfully opened in order for this function to validate it. + @param[in] space_id The expected tablespace ID. + @param[in] flags The expected tablespace flags. + @retval DB_SUCCESS if tablespace is valid, DB_ERROR if not. + m_is_valid is also set true on success, else false. */ + dberr_t validate_to_dd(ulint space_id, ulint flags) + MY_ATTRIBUTE((warn_unused_result)); + + /** Validates this datafile for the purpose of recovery. + The file should exist and be successfully opened. We initially + open it in read-only mode because we just want to read the SpaceID. + However, if the first page is corrupt and needs to be restored + from the doublewrite buffer, we will reopen it in write mode and + try to restore that page. 
+ @retval DB_SUCCESS if tablespace is valid, DB_ERROR if not. + m_is_valid is also set true on success, else false. */ + dberr_t validate_for_recovery() + MY_ATTRIBUTE((warn_unused_result)); + + /** Checks the consistency of the first page of a datafile when the + tablespace is opened. This occurs before the fil_space_t is created + so the Space ID found here must not already be open. + m_is_valid is set true on success, else false. + @param[out] flush_lsn contents of FIL_PAGE_FILE_FLUSH_LSN + @retval DB_SUCCESS on if the datafile is valid + @retval DB_CORRUPTION if the datafile is not readable + @retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */ + dberr_t validate_first_page(lsn_t* flush_lsn) + MY_ATTRIBUTE((warn_unused_result)); + + /** Get Datafile::m_name. + @return m_name */ + const char* name() const + { + return(m_name); + } + + /** Get Datafile::m_filepath. + @return m_filepath */ + const char* filepath() const + { + return(m_filepath); + } + + /** Get Datafile::m_handle. + @return m_handle */ + pfs_os_file_t handle() const + { + return(m_handle); + } + + /** @return detached file handle */ + pfs_os_file_t detach() + { + pfs_os_file_t detached = m_handle; + m_handle = OS_FILE_CLOSED; + return detached; + } + + /** Get Datafile::m_order. + @return m_order */ + ulint order() const + { + return(m_order); + } + + /** Get Datafile::m_space_id. + @return m_space_id */ + ulint space_id() const + { + return(m_space_id); + } + + /** Get Datafile::m_flags. + @return m_flags */ + ulint flags() const + { + return(m_flags); + } + + /** + @return true if m_handle is open, false if not */ + bool is_open() const + { + return(m_handle != OS_FILE_CLOSED); + } + + /** Get Datafile::m_is_valid. + @return m_is_valid */ + bool is_valid() const + { + return(m_is_valid); + } + + /** Get the last OS error reported + @return m_last_os_error */ + ulint last_os_error() const + { + return(m_last_os_error); + } + + /** Check whether the file is empty. 
+ @return true if file is empty */ + bool is_empty_file() const + { +#ifdef _WIN32 + os_offset_t offset = + (os_offset_t) m_file_info.nFileSizeLow + | ((os_offset_t) m_file_info.nFileSizeHigh << 32); + + return (offset == 0); +#else + return (m_file_info.st_size == 0); +#endif + } + + /** Check if the file exist. + @return true if file exists. */ + bool exists() const { return m_exists; } + + /** Test if the filepath provided looks the same as this filepath + by string comparison. If they are two different paths to the same + file, same_as() will be used to show that after the files are opened. + @param[in] other filepath to compare with + @retval true if it is the same filename by char comparison + @retval false if it looks different */ + bool same_filepath_as(const char* other) const; + + /** Test if another opened datafile is the same file as this object. + @param[in] other Datafile to compare with + @return true if it is the same file, else false */ + bool same_as(const Datafile& other) const; + + /** Get access to the first data page. + It is valid after open_read_only() succeeded. + @return the first data page */ + const byte* get_first_page() const { return(m_first_page); } + +private: + /** Free the filepath buffer. */ + void free_filepath(); + + /** Set the filename pointer to the start of the file name + in the filepath. */ + void set_filename() + { + if (m_filepath == NULL) { + return; + } + + char* last_slash = strrchr(m_filepath, OS_PATH_SEPARATOR); + + m_filename = last_slash ? last_slash + 1 : m_filepath; + } + + /** Create/open a data file. + @param[in] read_only_mode if true, then readonly mode checks + are enforced. + @return DB_SUCCESS or error code */ + dberr_t open_or_create(bool read_only_mode) + MY_ATTRIBUTE((warn_unused_result)); + + /** Reads a few significant fields from the first page of the + datafile, which must already be open. + @param[in] read_only_mode if true, then readonly mode checks + are enforced. 
+ @return DB_SUCCESS or DB_IO_ERROR if page cannot be read */ + dberr_t read_first_page(bool read_only_mode) + MY_ATTRIBUTE((warn_unused_result)); + + /** Free the first page from memory when it is no longer needed. */ + void free_first_page(); + + /** Set the Datafile::m_open_flags. + @param open_flags The Open flags to set. */ + void set_open_flags(os_file_create_t open_flags) + { + m_open_flags = open_flags; + }; + + /** Determine if this datafile is on a Raw Device + @return true if it is a RAW device. */ + bool is_raw_device() + { + return(m_type != SRV_NOT_RAW); + } + + /* DATA MEMBERS */ + + /** Datafile name at the tablespace location. + This is either the basename of the file if an absolute path + was entered, or it is the relative path to the datadir or + Tablespace::m_path. */ + char* m_name; + +protected: + /** Physical file path with base name and extension */ + char* m_filepath; + +private: + /** Determine the space id of the given file descriptor by reading + a few pages from the beginning of the .ibd file. + @return DB_SUCCESS if space id was successfully identified, + else DB_ERROR. */ + dberr_t find_space_id(); + + /** Restore the first page of the tablespace from + the double write buffer. + @return whether the operation failed */ + bool restore_from_doublewrite(); + + /** Points into m_filepath to the file name with extension */ + char* m_filename; + + /** Open file handle */ + pfs_os_file_t m_handle; + + /** Flags to use for opening the data file */ + os_file_create_t m_open_flags; + + /** size in megabytes or pages; converted from megabytes to + pages in SysTablespace::normalize_size() */ + uint32_t m_size; + + /** ordinal position of this datafile in the tablespace */ + ulint m_order; + + /** The type of the data file */ + device_t m_type; + + /** Tablespace ID. Contained in the datafile header. + If this is a system tablespace, FSP_SPACE_ID is only valid + in the first datafile. */ + ulint m_space_id; + + /** Tablespace flags. 
Contained in the datafile header. + If this is a system tablespace, FSP_SPACE_FLAGS are only valid + in the first datafile. */ + ulint m_flags; + + /** true if file already existed on startup */ + bool m_exists; + + /** true if the tablespace is valid */ + bool m_is_valid; + + /** Aligned buffer to hold first page */ + byte* m_first_page; + +protected: + /** Last OS error received so it can be reported if needed. */ + ulint m_last_os_error; + +public: + /** Use the following to determine the uniqueness of this datafile. */ +#ifdef _WIN32 + /* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */ + BY_HANDLE_FILE_INFORMATION m_file_info; +#else + /* Use field st_ino. */ + struct stat m_file_info; +#endif /* _WIN32 */ +}; + + +/** Data file that is located through an InnoDB Symbolic Link (ISL) file; +m_link_filepath holds the full path of the link file. */ +class RemoteDatafile : public Datafile +{ +private: + /** Link filename (full path) */ + char* m_link_filepath; + +public: + + RemoteDatafile() + : + m_link_filepath() + { + /* No op - base constructor is called. */ + } + + RemoteDatafile(const char*, ulint, ulint) + : + m_link_filepath() + { + /* No op - the three arguments are ignored and the base default constructor is called. */ + } + + ~RemoteDatafile() override + { + shutdown(); + } + + /** Release the resources. */ + void shutdown() override; + + /** Get the link filepath. + @return m_link_filepath */ + const char* link_filepath() const + { + return(m_link_filepath); + } + + /** Create a link filename based on the contents of m_name, + open that file, and read the contents into m_filepath. + @retval DB_SUCCESS if remote linked tablespace file is opened and read. + @retval DB_CANNOT_OPEN_FILE if the link file does not exist. */ + dberr_t open_link_file(); + + /** Delete an InnoDB Symbolic Link (ISL) file. */ + void delete_link_file(void); + + /** Open a handle to the file linked to in an InnoDB Symbolic Link file + in read-only mode so that it can be validated. 
+ @param[in] strict whether to issue error messages + @return DB_SUCCESS or error code */ + dberr_t open_read_only(bool strict) override; + + /** Opens a handle to the file linked to in an InnoDB Symbolic Link + file in read-write mode so that it can be restored from doublewrite + and validated. + @param[in] read_only_mode If true, then readonly mode checks + are enforced. + @return DB_SUCCESS or error code */ + dberr_t open_read_write(bool read_only_mode) override + MY_ATTRIBUTE((warn_unused_result)); + + /****************************************************************** + Static member functions; they cannot refer to data members. + ******************************************************************/ + + /** Creates a new InnoDB Symbolic Link (ISL) file. It is always + created under the 'datadir' of MySQL. The datadir is the directory + of a running mysqld program. We can refer to it by simply using + the path ".". + @param[in] name tablespace name + @param[in] filepath remote filepath of tablespace datafile + @return DB_SUCCESS or error code */ + static dberr_t create_link_file( + const char* name, + const char* filepath); + + /** Delete an InnoDB Symbolic Link (ISL) file by name. + @param[in] name tablespace name */ + static void delete_link_file(const char* name); + + /** Read an InnoDB Symbolic Link (ISL) file by name. + It is always created under the datadir of MySQL. + For file-per-table tablespaces, the isl file is expected to be + in a 'database' directory and called 'tablename.isl'. + The caller must free the memory returned if it is not null. 
+ @param[in] link_filepath filepath of the ISL file + @return Filepath of the IBD file read from the ISL file */ + static char* read_link_file( + const char* link_filepath); +}; +#endif /* fsp0file_h */ diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h new file mode 100644 index 00000000..7245db39 --- /dev/null +++ b/storage/innobase/include/fsp0fsp.h @@ -0,0 +1,761 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0fsp.h +File space management + +Created 12/18/1995 Heikki Tuuri +*******************************************************/ + +#ifndef fsp0fsp_h +#define fsp0fsp_h + +#include "assume_aligned.h" +#include "fsp0types.h" +#include "fut0lst.h" +#include "ut0byte.h" + +#ifndef UNIV_INNOCHECKSUM +#include "mtr0mtr.h" +#include "page0types.h" +#include "rem0types.h" +#else +# include "mach0data.h" +#endif /* !UNIV_INNOCHECKSUM */ + +/** @return the PAGE_SSIZE flags for the current innodb_page_size +(0 when srv_page_size == UNIV_PAGE_SIZE_ORIG) */ +#define FSP_FLAGS_PAGE_SSIZE() \ + ((srv_page_size == UNIV_PAGE_SIZE_ORIG) ? 
\ + 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_POS_PAGE_SSIZE) + +/** @return the PAGE_SSIZE flags for the current innodb_page_size in +full checksum format */ +#define FSP_FLAGS_FCRC32_PAGE_SSIZE() \ + ((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20; +see the table in fsp0types.h @{ */ +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 \ + (FSP_FLAGS_POS_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 \ + (FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 + 1) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 \ + (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 + 4) +/** Zero relative shift position of the PAGE_SSIZE field */ +#define FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101 \ + (FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 + 2) + +/** Bit mask of the PAGE_COMPRESSION field (1 bit) */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101 \ + (1U << FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field (4 bits) */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101 \ + (15U << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101) +/** Bit mask of the ATOMIC_WRITES field (2 bits) */ +#define FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101 \ + (3U << FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101) +/** Bit mask of the PAGE_SSIZE field (4 bits) */ +#define FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101 \ + (15U << FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101) + +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101) +/** Return the value of the 
PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101) +/** Return the value of the PAGE_SSIZE field */ +#define FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101) + +/* @} */ + +/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ + +/** Offset of the space header within a file page */ +#define FSP_HEADER_OFFSET FIL_PAGE_DATA + +/* The data structures in files are defined just as byte strings in C; +xdes_t is the byte string type used for an extent descriptor */ +typedef byte xdes_t; + +/* SPACE HEADER + ============ + +File space header data structure: this data structure is contained in the +first page of a space. The space for this header is reserved in every extent +descriptor page, but used only in the first. 
*/ + +/*-------------------------------------*/ +#define FSP_SPACE_ID 0 /* space id (4 bytes) */ +#define FSP_NOT_USED 4 /* this field contained a value up to + which we know that the modifications + in the database have been flushed to + the file space; not used now */ +#define FSP_SIZE 8 /* Current size of the space in + pages */ +#define FSP_FREE_LIMIT 12 /* Minimum page number for which the + free list has not been initialized: + the pages >= this limit are, by + definition, free; note that in a + single-table tablespace where size + < 64 pages, this number is 64, i.e., + we have initialized the space + about the first extent, but have not + physically allocated those pages to the + file */ +#define FSP_SPACE_FLAGS 16 /* fsp_space_t.flags, similar to + dict_table_t::flags */ +#define FSP_FRAG_N_USED 20 /* number of used pages in the + FSP_FREE_FRAG list */ +#define FSP_FREE 24 /* list of free extents */ +#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE) + /* list of partially free extents not + belonging to any segment */ +#define FSP_FULL_FRAG (24 + 2 * 
FLST_BASE_NODE_SIZE) + /* list of full extents not belonging + to any segment */ +#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE) + /* 8 bytes which give the first unused + segment id */ +#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where all the segment inode + slots are reserved */ +#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where not all the segment + inode slots are reserved */ +/*-------------------------------------*/ +/* File space header size in bytes */ +#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE) + +#define FSP_FREE_ADD 4 /* this many free extents are added + to the free list from above + FSP_FREE_LIMIT at a time */ +/* @} */ + +/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */ + +/* FILE SEGMENT INODE + ================== + +Segment inode which is created for each segment in a tablespace. NOTE: in +purge we assume that a segment having only one currently used page can be +freed in a few steps, so that the freeing cannot fill the file buffer with +bufferfixed file pages. 
*/ + +typedef byte fseg_inode_t; + +#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA + /* the list node for linking + segment inode pages */ + +#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE) +/*-------------------------------------*/ +#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0, + it means that the header is unused */ +#define FSEG_NOT_FULL_N_USED 8 + /* number of used segment pages in + the FSEG_NOT_FULL list (4 bytes) */ +#define FSEG_FREE 12 + /* list of free extents of this + segment */ +#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE) + /* list of partially free extents */ +#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents */ +#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE) + /* magic number used in debugging + (4 bytes) */ +#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE) + /* array of individual pages + belonging to this segment in fsp + fragment extent lists */ +#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2) + /* number of slots in the array for + the fragment pages */ +#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its + page number within space, FIL_NULL + means that the slot is not in use */ +/*-------------------------------------*/ +#define FSEG_INODE_SIZE \ + (16 + 3 * FLST_BASE_NODE_SIZE \ + + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) + +static constexpr uint32_t FSEG_MAGIC_N_VALUE= 97937874; + +#define FSEG_FILLFACTOR 8 /* If this value is x, then if + the number of unused but reserved + pages in a segment is less than + reserved pages * 1/x, and there are + at least FSEG_FRAG_LIMIT used pages, + then we allow a new empty extent to + be added to the segment in + fseg_alloc_free_page. Otherwise, we + use unused pages of the segment. 
*/ + +#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS + /* If the segment has >= this many + used pages, it may be expanded by + allocating extents to the segment; + until then only individual fragment + pages are allocated from the space */ + +#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment + is at least this many extents, we + allow extents to be put to the free + list of the segment: at most + FSEG_FREE_LIST_MAX_LEN many */ +#define FSEG_FREE_LIST_MAX_LEN 4 +/* @} */ + +/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */ + +/* EXTENT DESCRIPTOR + ================= + +File extent descriptor data structure: contains bits to tell which pages in +the extent are free and which contain old tuple versions to clean. */ + +/*-------------------------------------*/ +#define XDES_ID 0 /* The identifier of the segment + to which this extent belongs */ +#define XDES_FLST_NODE 8 /* The list node data structure + for the descriptors */ +#define XDES_STATE (FLST_NODE_SIZE + 8) + /* contains state information + of the extent */ +#define XDES_BITMAP (FLST_NODE_SIZE + 12) + /* Descriptor bitmap of the pages + in the extent */ +/*-------------------------------------*/ + +#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */ +#define XDES_FREE_BIT 0 /* Index of the bit which tells if + the page is free */ +#define XDES_CLEAN_BIT 1 /* NOTE: currently not used! + Index of the bit which tells if + there are old versions of tuples + on the page */ +/* States of a descriptor */ +#define XDES_FREE 1 /* extent is in free list of space */ +#define XDES_FREE_FRAG 2 /* extent is in free fragment list of + space */ +#define XDES_FULL_FRAG 3 /* extent is in full fragment list of + space */ +#define XDES_FSEG 4 /* extent belongs to a segment */ + +/** File extent data structure size in bytes. 
*/ +#define XDES_SIZE \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MAX page size. */ +#define XDES_SIZE_MAX \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MIN page size. */ +#define XDES_SIZE_MIN \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE)) + +/** Offset of the descriptor array on a descriptor page */ +#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE) + +/** +Determine if a page is marked free. +Reads bit XDES_FREE_BIT of the page's XDES_BITS_PER_PAGE-bit group +in the bitmap that starts at descr + XDES_BITMAP. +@param[in] descr extent descriptor +@param[in] offset page offset within extent; must be less than + FSP_EXTENT_SIZE (checked with ut_ad()) +@return whether the page is free */ +inline bool xdes_is_free(const xdes_t *descr, ulint offset) +{ + ut_ad(offset < FSP_EXTENT_SIZE); + ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset; + return ut_bit_get_nth(descr[XDES_BITMAP + (index >> 3)], index & 7); +} + +#ifndef UNIV_INNOCHECKSUM +/* @} */ + +/** Read a tablespace header field. +@param[in] page first page of a tablespace +@param[in] field byte offset of the header field, relative to + FSP_HEADER_OFFSET (for example FSP_SPACE_FLAGS) +@return the 4-byte value read at that offset with mach_read_from_4() */ +inline uint32_t fsp_header_get_field(const page_t* page, ulint field) +{ + return mach_read_from_4(FSP_HEADER_OFFSET + field + + my_assume_aligned<UNIV_ZIP_SIZE_MIN>(page)); +} + +/** Read the flags from the tablespace header page. +@param[in] page first page of a tablespace +@return the contents of FSP_SPACE_FLAGS */ +inline uint32_t fsp_header_get_flags(const page_t *page) +{ + return fsp_header_get_field(page, FSP_SPACE_FLAGS); +} + +/** Get the byte offset of encryption information in page 0. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return byte offset relative to FSP_HEADER_OFFSET */ +inline MY_ATTRIBUTE((pure, warn_unused_result)) +ulint fsp_header_get_encryption_offset(ulint zip_size) +{ + return zip_size + ? 
XDES_ARR_OFFSET + XDES_SIZE * zip_size / FSP_EXTENT_SIZE + : XDES_ARR_OFFSET + (XDES_SIZE << srv_page_size_shift) + / FSP_EXTENT_SIZE; +} + +/** Check the encryption key from the first page of a tablespace. +@param[in] fsp_flags tablespace flags +@param[in] page first page of a tablespace +@return true on success */ +bool +fsp_header_check_encryption_key( + ulint fsp_flags, + page_t* page); + +/**********************************************************************//** +Writes the space id and flags to a tablespace header. The flags contain +row type, physical/compressed page size, and logical/uncompressed page +size of the tablespace. */ +void +fsp_header_init_fields( +/*===================*/ + page_t* page, /*!< in/out: first page in the space */ + ulint space_id, /*!< in: space id */ + ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS): + 0, or table->flags if newer than COMPACT */ +/** Initialize a tablespace header. +@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction */ +void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); + +/** Create a new segment. +@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval NULL if could not create segment because of lack of space */ +buf_block_t* +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, + bool has_done_reservation= false, buf_block_t *block= NULL); + +/** Calculate the number of pages reserved by a segment, +and how many pages are currently used. 
+@param[in] block buffer block containing the file segment header +@param[in] header file segment header +@param[out] used number of pages that are used (not more than reserved) +@param[in,out] mtr mini-transaction +@return number of reserved pages */ +ulint fseg_n_reserved_pages(const buf_block_t &block, + const fseg_header_t *header, ulint *used, + mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize +file space fragmentation. +This is shorthand for fseg_alloc_free_page_general() with +has_done_reservation=false and init_mtr=mtr. +@param[in,out] seg_header segment header +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR +@param[in,out] mtr mini-transaction +@return X-latched block, or NULL if no page could be allocated */ +#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \ + fseg_alloc_free_page_general(seg_header, hint, direction, \ + false, mtr, mtr) +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. 
+@retval NULL if no page could be allocated */ +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + uint32_t hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + bool has_done_reservation, /*!< in: true if the caller has + already done the reservation for the page + with fsp_reserve_free_extents(); in that + case there is no need to do the check for + this individual page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); + +/** Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. Reservations +must be released with function fil_space_t::release_free_extents()! + +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. 
+The purpose is to avoid a dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special +case. In this function we would liberally reserve several extents for +every page split or merge in a B-tree. But we do not want to waste disk space +if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply +different rules in that special case, just ensuring that there are n_pages +free pages available. + +@param[out] n_reserved number of extents actually reserved; if we + return true and the tablespace size is < + FSP_EXTENT_SIZE pages, then this can be 0, + otherwise it is n_ext +@param[in,out] space tablespace +@param[in] n_ext number of extents to reserve +@param[in] alloc_type page reservation type (FSP_NORMAL, FSP_UNDO, + FSP_CLEANING, FSP_BLOB, etc) +@param[in,out] mtr the mini transaction +@param[in] n_pages for small tablespaces (tablespace size is + less than FSP_EXTENT_SIZE), number of free + pages to reserve; defaults to 2 +@return true if we were able to make the reservation */ +bool +fsp_reserve_free_extents( + uint32_t* n_reserved, + fil_space_t* space, + uint32_t n_ext, + fsp_reserve_t alloc_type, + mtr_t* mtr, + uint32_t n_pages = 2); + +/** Free a page in a file segment. +@param[in,out] seg_header file segment header +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction */ +void +fseg_free_page( + fseg_header_t* seg_header, + fil_space_t* space, + uint32_t offset, + mtr_t* mtr); +/** Determine whether a page is free. +@param[in,out] space tablespace +@param[in] page page number +@return whether the page is marked as free */ +bool +fseg_page_is_free(fil_space_t* space, unsigned page) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/**********************************************************************//** +Frees part of a segment. 
This function can be used to free a segment +by repeatedly calling this function in different mini-transactions. +Doing the freeing in a single mini-transaction might result in +too big a mini-transaction. +@return whether the freeing was completed */ +bool +fseg_free_step( + fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header + resides on the first page of the frag list + of the segment, this pointer becomes obsolete + after the last freeing step */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************************//** +Frees part of a segment. Differs from fseg_free_step because this function +leaves the header page unfreed. +@return whether the freeing was completed, except for the header page */ +bool +fseg_free_step_not_header( + fseg_header_t* header, /*!< in: segment header which must reside on + the first fragment page of the segment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Reset the page type. +Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE. +In MySQL 3.23.53, only undo log pages and index pages were tagged. +Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE. +@param[in] block block with invalid FIL_PAGE_TYPE +@param[in] type expected page type +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr); + +/** Check (and if needed, reset) the page type. +Data files created before MySQL 5.1.48 may contain +garbage in the FIL_PAGE_TYPE field. +In MySQL 3.23.53, only undo log pages and index pages were tagged. +Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE. 
+@param[in] block block with possibly invalid FIL_PAGE_TYPE +@param[in] type expected page type +@param[in,out] mtr mini-transaction */ +inline void +fil_block_check_type( + const buf_block_t& block, + ulint type, + mtr_t* mtr) +{ + if (UNIV_UNLIKELY(type != fil_page_get_type(block.frame))) { + fil_block_reset_type(block, type, mtr); + } +} + +/** Checks if a page address is an extent descriptor page address. +@param[in] page_id page id +@param[in] physical_size page size +@return whether a descriptor page */ +inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size) +{ + return (page_id.page_no() & (physical_size - 1)) == FSP_XDES_OFFSET; +} + +/** Initialize a file page whose prior contents should be ignored. +@param[in,out] block buffer pool block */ +void fsp_apply_init_file_page(buf_block_t *block); + +/** Initialize a file page. +@param[in] space tablespace; this parameter exists only when + UNIV_DEBUG is defined, otherwise the + fsp_init_file_page() macro drops the argument +@param[in,out] block file page +@param[in,out] mtr mini-transaction */ +inline void fsp_init_file_page( +#ifdef UNIV_DEBUG + const fil_space_t* space, +#endif + buf_block_t* block, mtr_t* mtr) +{ + ut_d(space->modify_check(*mtr)); + ut_ad(space->id == block->page.id().space()); + fsp_apply_init_file_page(block); + mtr->init(block); +} + +#ifndef UNIV_DEBUG +# define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr) +#endif + +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. */ +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +#endif /* UNIV_BTR_PRINT */ + +/** Convert FSP_SPACE_FLAGS from the buggy MariaDB 10.1.0..10.1.20 format. 
+@param[in] flags the contents of FSP_SPACE_FLAGS +@return the flags corrected from the buggy MariaDB 10.1 format +@retval ULINT_UNDEFINED if the flags are not in the buggy 10.1 format */ +MY_ATTRIBUTE((warn_unused_result, const)) +UNIV_INLINE +ulint +fsp_flags_convert_from_101(ulint flags) +{ + DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", + return(ULINT_UNDEFINED);); + if (flags == 0 || fil_space_t::full_crc32(flags)) { + return(flags); + } + + if (flags >> 18) { + /* The most significant FSP_SPACE_FLAGS bit that was ever set + by MariaDB 10.1.0 to 10.1.20 was bit 17 (misplaced DATA_DIR flag). + The flags must be less than 1<<18 in order to be valid. */ + return(ULINT_UNDEFINED); + } + + if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS)) + == FSP_FLAGS_MASK_ATOMIC_BLOBS) { + /* If the "atomic blobs" flag (indicating + ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) + is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag + must also be set. */ + return(ULINT_UNDEFINED); + } + + /* Bits 6..10 denote compression in MariaDB 10.1.0 to 10.1.20. + They must be either 0b00000 or 0b00011 through 0b10011. + In correct versions, these bits would be + 0bd0sss where d is the DATA_DIR flag (garbage bit) and + sss is the PAGE_SSIZE (3, 4, 6, or 7). + + NOTE: MariaDB 10.1.0 to 10.1.20 can misinterpret + uncompressed data files with innodb_page_size=4k or 64k as + compressed innodb_page_size=16k files. Below is an exhaustive + state space analysis. + + -0by1zzz: impossible (the bit 4 must be clean; see above) + -0b101xx: DATA_DIR, innodb_page_size>4k: invalid (COMPRESSION_LEVEL>9) + +0bx0011: innodb_page_size=4k: + !!! Misinterpreted as COMPRESSION_LEVEL=9 or 1, COMPRESSION=1. + -0bx0010: impossible, because sss must be 0b011 or 0b1xx + -0bx0001: impossible, because sss must be 0b011 or 0b1xx + -0b10000: DATA_DIR, innodb_page_size=16k: + invalid (COMPRESSION_LEVEL=8 but COMPRESSION=0) + +0b00111: no DATA_DIR, innodb_page_size=64k: + !!! 
Misinterpreted as COMPRESSION_LEVEL=3, COMPRESSION=1. + -0b00101: impossible, because sss must be 0 for 16k, not 0b101 + -0b001x0: no DATA_DIR, innodb_page_size=32k or 8k: + invalid (COMPRESSION_LEVEL=3 but COMPRESSION=0) + +0b00000: innodb_page_size=16k (looks like COMPRESSION=0) + ??? Could actually be compressed; see PAGE_SSIZE below */ + const ulint level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101( + flags); + if (FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) != (level != 0) + || level > 9) { + /* The compression flags are not in the buggy MariaDB + 10.1 format. */ + return(ULINT_UNDEFINED); + } + if (!(~flags & FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101)) { + /* The ATOMIC_WRITES flags cannot be 0b11. + (The bits 11..12 should actually never be 0b11, + because in MySQL they would be SHARED|TEMPORARY.) */ + return(ULINT_UNDEFINED); + } + + /* Bits 13..16 are the wrong position for PAGE_SSIZE, and they + should contain 0 or one of the values 3,4,6,7, that is, be of the + form 0b0000, 0b0011 or 0b01xx (except 0b0101). + In correct versions, these bits should be 0bc0se + where c is the MariaDB COMPRESSED flag + and e is the MySQL 5.7 ENCRYPTION flag + and s is the MySQL 8.0 SDI flag. MariaDB can only support s=0, e=0. + + Compressed innodb_page_size=16k tables with correct FSP_SPACE_FLAGS + will be properly rejected by older MariaDB 10.1.x because they + would read as PAGE_SSIZE>=8 which is not valid. */ + + const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags); + if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { + /* the page_size is not between 4k and 64k; + 16k should be encoded as 0, not 5 */ + return(ULINT_UNDEFINED); + } + const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + if (zssize == 0) { + /* not ROW_FORMAT=COMPRESSED */ + } else if (zssize > (ssize ? 
ssize : 5)) { + /* invalid KEY_BLOCK_SIZE */ + return(ULINT_UNDEFINED); + } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { + /* both these flags should be set for + ROW_FORMAT=COMPRESSED */ + return(ULINT_UNDEFINED); + } + + flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE + | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) + << FSP_FLAGS_POS_PAGE_COMPRESSION); + ut_ad(fil_space_t::is_valid_flags(flags, false)); + return(flags); +} + +/** Compare tablespace flags. +The FSP_FLAGS_MEM_MASK bits of expected are ignored. If the values differ, +actual is converted with fsp_flags_convert_from_101() and compared again. +@param[in] expected expected flags from dict_tf_to_fsp_flags() +@param[in] actual flags read from FSP_SPACE_FLAGS +@return whether the flags match */ +MY_ATTRIBUTE((warn_unused_result)) +UNIV_INLINE +bool +fsp_flags_match(ulint expected, ulint actual) +{ + expected &= ~FSP_FLAGS_MEM_MASK; + ut_ad(fil_space_t::is_valid_flags(expected, false)); + + if (actual == expected) { + return(true); + } + + actual = fsp_flags_convert_from_101(actual); + return(actual == expected); +} + +/** Determine the descriptor index within a descriptor page. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] offset page offset +@return descriptor index */ +inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset) +{ + return ut_2pow_remainder<ulint>(offset, + zip_size ? zip_size : srv_page_size) + / FSP_EXTENT_SIZE; +} + +/** Determine the descriptor page number for a page. 
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] offset page offset +@return descriptor page offset */ +inline uint32_t xdes_calc_descriptor_page(ulint zip_size, uint32_t offset) +{ + compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) + * XDES_SIZE_MAX); + compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN) + * XDES_SIZE_MIN); + + ut_ad(srv_page_size > XDES_ARR_OFFSET + + (srv_page_size / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(!zip_size + || zip_size > XDES_ARR_OFFSET + + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); + return ut_2pow_round(offset, + uint32_t(zip_size ? zip_size : srv_page_size)); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h new file mode 100644 index 00000000..c00c8d68 --- /dev/null +++ b/storage/innobase/include/fsp0space.h @@ -0,0 +1,242 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0space.h +Shared tablespace interface + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#ifndef fsp0space_h +#define fsp0space_h + +#include "fsp0file.h" +#include "fsp0fsp.h" +#include "fsp0types.h" + +#include <vector> + +/** Data structure that contains the information about shared tablespaces. +Currently this can be the system tablespace or a temporary table tablespace */ +class Tablespace { + +public: + typedef std::vector<Datafile, ut_allocator<Datafile> > files_t; + + /** Data file information - each Datafile can be accessed globally */ + files_t m_files; + /** Data file iterator */ + typedef files_t::iterator iterator; + /** Data file iterator */ + typedef files_t::const_iterator const_iterator; + + Tablespace() + : + m_files(), + m_name(), + m_space_id(ULINT_UNDEFINED), + m_path(), + m_flags(), + m_ignore_read_only(false) + { + /* No op */ + } + + virtual ~Tablespace() + { + shutdown(); + ut_ad(m_files.empty()); + ut_ad(m_space_id == ULINT_UNDEFINED); + } + + // Disable copying + Tablespace(const Tablespace&); + Tablespace& operator=(const Tablespace&); + + /** Data file iterator */ + const_iterator begin() const { return m_files.begin(); } + /** Data file iterator */ + const_iterator end() const { return m_files.end(); } + /** Data file iterator */ + iterator begin() { return m_files.begin(); } + /** Data file iterator */ + iterator end() { return m_files.end(); } + + void set_name(const char* name) { m_name = name; } + const char* name() const { return m_name; } + + /** Set tablespace path and filename members. 
+ @param[in] path where tablespace file(s) resides + @param[in] len length of the file path */ + void set_path(const char* path, size_t len) + { + ut_ad(m_path == NULL); + m_path = mem_strdupl(path, len); + ut_ad(m_path != NULL); + + os_normalize_path(m_path); + } + + /** Set tablespace path and filename members. + @param[in] path where tablespace file(s) resides */ + void set_path(const char* path) + { + set_path(path, strlen(path)); + } + + /** Get tablespace path + @return tablespace path */ + const char* path() const + { + return(m_path); + } + + /** Set the space id of the tablespace + @param[in] space_id tablespace ID to set */ + void set_space_id(ulint space_id) + { + ut_ad(m_space_id == ULINT_UNDEFINED); + m_space_id = space_id; + } + + /** Get the space id of the tablespace + @return m_space_id space id of the tablespace */ + ulint space_id() const + { + return(m_space_id); + } + + /** Set the tablespace flags + @param[in] fsp_flags tablespace flags */ + void set_flags(ulint fsp_flags) + { + ut_ad(fil_space_t::is_valid_flags(fsp_flags, false)); + m_flags = fsp_flags; + } + + /** Get the tablespace flags + @return m_flags tablespace flags */ + ulint flags() const + { + return(m_flags); + } + + /** Get the tablespace encryption mode + @return m_mode tablespace encryption mode */ + fil_encryption_t encryption_mode() const + { + return (m_mode); + } + + /** Get the tablespace encryption key_id + @return m_key_id tablespace encryption key_id */ + uint32_t key_id() const + { + return (m_key_id); + } + + /** Set Ignore Read Only Status for tablespace. 
+ @param[in] read_only_status read only status indicator */ + void set_ignore_read_only(bool read_only_status) + { + m_ignore_read_only = read_only_status; + } + + /** Free the memory allocated by the Tablespace object */ + void shutdown(); + + /** @return the sum of the file sizes of each Datafile */ + uint32_t get_sum_of_sizes() const + { + uint32_t sum = 0; + + for (const_iterator it = begin(); it != end(); ++it) { + sum += it->m_size; + } + + return(sum); + } + + /** Open or Create the data files if they do not exist. + @param[in] is_temp whether this is a temporary tablespace + @return DB_SUCCESS or error code */ + dberr_t open_or_create(bool is_temp) + MY_ATTRIBUTE((warn_unused_result)); + + /** Delete all the data files. */ + void delete_files(); + + /** Check if two tablespaces have common data file names. + @param[in] other_space Tablespace to check against this. + @return true if they have the same data filenames and paths */ + bool intersection(const Tablespace* other_space); + + /** Use the ADD DATAFILE path to create a Datafile object and add + it to the front of m_files. Parse the datafile path into a path + and a basename with extension 'ibd'. This datafile_path provided + may be an absolute or relative path, but it must end with the + extension .ibd and have a basename of at least 1 byte. + + Set tablespace m_path member and add a Datafile with the filename. + @param[in] datafile_path full path of the tablespace file. */ + dberr_t add_datafile( + const char* datafile_path); + + /* Return a pointer to the first Datafile for this Tablespace + @return pointer to the first Datafile for this Tablespace*/ + Datafile* first_datafile() + { + ut_a(!m_files.empty()); + return(&m_files.front()); + } +private: + /** + @param[in] filename Name to lookup in the data files. + @return true if the filename exists in the data files */ + bool find(const char* filename) const; + + /** Note that the data file was found. 
+ @param[in] file data file object */ + void file_found(Datafile& file); + + /* DATA MEMBERS */ + + /** Name of the tablespace. */ + const char* m_name; + + /** Tablespace ID */ + ulint m_space_id; + + /** Path where tablespace files will reside, not including a filename.*/ + char* m_path; + + /** Tablespace flags */ + ulint m_flags; + + /** Encryption mode and key_id */ + fil_encryption_t m_mode; + uint32_t m_key_id; + +protected: + /** Ignore server read only configuration for this tablespace. */ + bool m_ignore_read_only; +}; + +#endif /* fsp0space_h */ diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h new file mode 100644 index 00000000..2e0a395f --- /dev/null +++ b/storage/innobase/include/fsp0sysspace.h @@ -0,0 +1,289 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0sysspace.h +Multi file, shared, system tablespace implementation. 
+ +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#ifndef fsp0sysspace_h +#define fsp0sysspace_h + +#include "fsp0space.h" + +/** If the last data file is auto-extended, we add this many pages to it +at a time. We have to make this public because it is a config variable. */ +extern uint sys_tablespace_auto_extend_increment; + +/** Data structure that contains the information about shared tablespaces. +Currently this can be the system tablespace or a temporary table tablespace */ +class SysTablespace : public Tablespace +{ +public: + + SysTablespace() + : + m_auto_extend_last_file(), + m_last_file_size_max(), + m_created_new_raw(), + m_is_tablespace_full(false), + m_sanity_checks_done(false) + { + /* No op */ + } + + ~SysTablespace() override + { + shutdown(); + } + + /** Set tablespace full status + @param[in] is_full true if full */ + void set_tablespace_full_status(bool is_full) + { + m_is_tablespace_full = is_full; + } + + /** Get tablespace full status + @return true if table is full */ + bool get_tablespace_full_status() + { + return(m_is_tablespace_full); + } + + /** Set sanity check status + @param[in] status true if sanity checks are done */ + void set_sanity_check_status(bool status) + { + m_sanity_checks_done = status; + } + + /** Get sanity check status + @return true if sanity checks are done */ + bool get_sanity_check_status() + { + return(m_sanity_checks_done); + } + + /** Parse the input params and populate member variables. + @param filepath path to data files + @param supports_raw true if it supports raw devices + @return true on success parse */ + bool parse_params(const char* filepath, bool supports_raw); + + /** Check the data file specification. 
+ @param[out] create_new_db true if a new database + is to be created + @param[in] min_expected_tablespace_size expected tablespace + size in bytes + @return DB_SUCCESS if all OK else error code */ + dberr_t check_file_spec( + bool* create_new_db, + ulint min_expected_tablespace_size); + + /** Free the memory allocated by parse_params() */ + void shutdown(); + + /** Normalize the file size, convert to extents. */ + void normalize_size(); + + /** + @return true if a new raw device was created. */ + bool created_new_raw() const + { + return(m_created_new_raw); + } + + /** + @return auto_extend value setting */ + ulint can_auto_extend_last_file() const + { + return(m_auto_extend_last_file); + } + + /** Set the last file size. + @param[in] size the size to set */ + void set_last_file_size(uint32_t size) + { + ut_ad(!m_files.empty()); + m_files.back().m_size = size; + } + + /** Get the size of the last data file in the tablespace + @return the size of the last data file in the array */ + uint32_t last_file_size() const + { + ut_ad(!m_files.empty()); + return(m_files.back().m_size); + } + + /** + @return the autoextend increment in pages. */ + uint32_t get_autoextend_increment() const + { + return sys_tablespace_auto_extend_increment + << (20 - srv_page_size_shift); + } + + /** + @return next increment size */ + uint32_t get_increment() const; + + /** Open or create the data files + @param[in] is_temp whether this is a temporary tablespace + @param[in] create_new_db whether we are creating a new database + @param[out] sum_new_sizes sum of sizes of the new files added + @param[out] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first file + @return DB_SUCCESS or error code */ + dberr_t open_or_create( + bool is_temp, + bool create_new_db, + ulint* sum_new_sizes, + lsn_t* flush_lsn) + MY_ATTRIBUTE((warn_unused_result)); + +private: + /** Check the tablespace header for this tablespace. 
+ @param[out] flushed_lsn the value of FIL_PAGE_FILE_FLUSH_LSN + @return DB_SUCCESS or error code */ + dberr_t read_lsn_and_check_flags(lsn_t* flushed_lsn); + + /** + @return true if the last file size is valid. */ + bool is_valid_size() const + { + return(m_last_file_size_max >= last_file_size()); + } + + /** + @return true if configured to use raw devices */ + bool has_raw_device(); + + /** Note that the data file was not found. + @param[in] file data file object + @param[out] create_new_db true if a new instance is to be created + @return DB_SUCCESS or error code */ + dberr_t file_not_found(Datafile& file, bool* create_new_db); + + /** Note that the data file was found. + @param[in,out] file data file object + @return true if a new instance is to be created */ + bool file_found(Datafile& file); + + /** Create a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t create(Datafile& file); + + /** Create a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t create_file(Datafile& file); + + /** Open a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t open_file(Datafile& file); + + /** Set the size of the file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t set_size(Datafile& file); + + /** Convert a numeric string that optionally ends in G or M, to a + number containing megabytes. 
+ @param[in] ptr string with a quantity in bytes + @param[out] megs the number in megabytes + @return next character in string */ + static char* parse_units(char* ptr, ulint* megs); + +private: + enum file_status_t { + FILE_STATUS_VOID = 0, /** status not set */ + FILE_STATUS_RW_PERMISSION_ERROR,/** permission error */ + FILE_STATUS_READ_WRITE_ERROR, /** not readable/writable */ + FILE_STATUS_NOT_REGULAR_FILE_ERROR /** not a regular file */ + }; + + /** Verify the size of the physical file + @param[in] file data file object + @return DB_SUCCESS if OK else error code. */ + dberr_t check_size(Datafile& file); + + /** Check if a file can be opened in the correct mode. + @param[in] file data file object + @param[out] reason exact reason if file_status check failed. + @return DB_SUCCESS or error code. */ + dberr_t check_file_status( + const Datafile& file, + file_status_t& reason); + + /* DATA MEMBERS */ + + /** if true, then we auto-extend the last data file */ + bool m_auto_extend_last_file; + + /** maximum size of the last data file (0=unlimited) */ + ulint m_last_file_size_max; + + /** If the following is true we do not allow + inserts etc. This protects the user from forgetting + the 'newraw' keyword to my.cnf */ + bool m_created_new_raw; + + /** Tablespace full status */ + bool m_is_tablespace_full; + + /** if false, then sanity checks are still pending */ + bool m_sanity_checks_done; +}; + +/* GLOBAL OBJECTS */ + +/** The control info of the system tablespace. */ +extern SysTablespace srv_sys_space; + +/** The control info of a temporary table shared tablespace. */ +extern SysTablespace srv_tmp_space; + +/** Check if the space_id is for a system-tablespace (shared + temp). +@param[in] id Space ID to check +@return true if id is a system tablespace, false if not. */ +UNIV_INLINE +bool +is_system_tablespace(ulint id) +{ + return(id == TRX_SYS_SPACE || id == SRV_TMP_SPACE_ID); +} + +/** Check if predefined shared tablespace. 
+@return true if predefined shared tablespace */ +UNIV_INLINE +bool +is_predefined_tablespace( + ulint id) +{ + ut_ad(srv_sys_space.space_id() == TRX_SYS_SPACE); + ut_ad(TRX_SYS_SPACE == 0); + return(id == TRX_SYS_SPACE + || id == SRV_TMP_SPACE_ID + || srv_is_undo_tablespace(id)); +} +#endif /* fsp0sysspace_h */ diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h new file mode 100644 index 00000000..f8e4c06b --- /dev/null +++ b/storage/innobase/include/fsp0types.h @@ -0,0 +1,405 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/****************************************************** +@file include/fsp0types.h +File space management types + +Created May 26, 2009 Vasil Dimov +*******************************************************/ + +#pragma once +#include <cstddef> + +/** The fil_space_t::id of the redo log. All persistent tablespaces +have a smaller fil_space_t::id. */ +static constexpr size_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0; +/** The fil_space_t::id of the innodb_temporary tablespace. 
*/ +#define SRV_TMP_SPACE_ID 0xFFFFFFFEU + +#include "ut0byte.h" + +/* Possible values of innodb_compression_algorithm */ +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_LZMA_ALGORITHM 4 +#define PAGE_BZIP2_ALGORITHM 5 +#define PAGE_SNAPPY_ALGORITHM 6 +#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM + +/** @name Flags for inserting records in order +If records are inserted in order, there are the following +flags to tell this (their type is made byte for the compiler +to warn if direction and hint parameters are switched in +fseg_alloc_free_page) */ +/* @{ */ +#define FSP_UP ((byte)111) /*!< alphabetically upwards */ +#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */ +#define FSP_NO_DIR ((byte)113) /*!< no order */ +/* @} */ + +/** File space extent size in pages +page size | file space extent size +----------+----------------------- + 4 KiB | 256 pages = 1 MiB + 8 KiB | 128 pages = 1 MiB + 16 KiB | 64 pages = 1 MiB + 32 KiB | 64 pages = 2 MiB + 64 KiB | 64 pages = 4 MiB +*/ +#define FSP_EXTENT_SIZE (srv_page_size_shift < 14 ? \ + (1048576U >> srv_page_size_shift) : 64U) + +/** File space extent size (four megabyte) in pages for MAX page size */ +#define FSP_EXTENT_SIZE_MAX (4194304 / UNIV_PAGE_SIZE_MAX) + +/** File space extent size (one megabyte) in pages for MIN page size */ +#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN) + +/** On a page of any file segment, data may be put starting from this +offset */ +#define FSEG_PAGE_DATA FIL_PAGE_DATA + +/** @name File segment header +The file segment header points to the inode describing the file segment. 
*/ +/* @{ */ +/** Data type for file segment header */ +typedef byte fseg_header_t; + +#define FSEG_HDR_SPACE 0 /*!< space id of the inode */ +#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */ +#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */ + +#define FSEG_HEADER_SIZE 10 /*!< Length of the file segment + header, in bytes */ +/* @} */ + +#ifndef UNIV_INNOCHECKSUM +#ifdef UNIV_DEBUG + +struct mtr_t; + +/** A wrapper class to print the file segment header information. */ +class fseg_header +{ +public: + /** Constructor of fseg_header. + @param[in] header the underlying file segment header object + @param[in] mtr the mini-transaction. No redo logs are + generated, only latches are checked within + mini-transaction */ + fseg_header( + const fseg_header_t* header, + mtr_t* mtr) + : + m_header(header), + m_mtr(mtr) + {} + + /** Print the file segment header to the given output stream. + @param[in,out] out the output stream into which the object + is printed. + @return the output stream into which the object was printed. */ + std::ostream& + to_stream(std::ostream& out) const; +private: + /** The underlying file segment header */ + const fseg_header_t* m_header; + + /** The mini transaction, which is used mainly to check whether + appropriate latches have been taken by the calling thread. 
*/ + mtr_t* m_mtr; +}; + +/* Overloading the global output operator to print a file segment header +@param[in,out] out the output stream into which object will be printed +@param[in] header the file segment header to be printed +@retval the output stream */ +inline +std::ostream& +operator<<( + std::ostream& out, + const fseg_header& header) +{ + return(header.to_stream(out)); +} +#endif /* UNIV_DEBUG */ + +/** Flags for fsp_reserve_free_extents */ +enum fsp_reserve_t { + FSP_NORMAL, /* reservation during normal B-tree operations */ + FSP_UNDO, /* reservation done for undo logging */ + FSP_CLEANING, /* reservation done during purge operations */ + FSP_BLOB /* reservation being done for BLOB insertion */ +}; + +/* Number of pages described in a single descriptor page: currently each page +description takes less than 1 byte; a descriptor page is repeated every +this many file pages */ +/* #define XDES_DESCRIBED_PER_PAGE srv_page_size */ +/* This has been replaced with either srv_page_size or page_zip->size. */ + +/** @name The space low address page map +The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated +every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */ +/* @{ */ +/*--------------------------------------*/ +#define FSP_XDES_OFFSET 0U /* !< extent descriptor */ +#define FSP_IBUF_BITMAP_OFFSET 1U /* !< insert buffer bitmap */ + /* The ibuf bitmap pages are the ones whose + page number is the number above plus a + multiple of XDES_DESCRIBED_PER_PAGE */ + +#define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */ + /* The following pages exist + in the system tablespace (space 0). 
*/ +#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer + header page, in + tablespace 0 */ +#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer + B-tree root page in + tablespace 0 */ + /* The ibuf tree root page number in + tablespace 0; its fseg inode is on the page + number FSP_FIRST_INODE_PAGE_NO */ +#define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction + system header, in + tablespace 0 */ +#define FSP_FIRST_RSEG_PAGE_NO 6U /*!< first rollback segment + page, in tablespace 0 */ +#define FSP_DICT_HDR_PAGE_NO 7U /*!< data dictionary header + page, in tablespace 0 */ +/*--------------------------------------*/ +/* @} */ + +/** Check if tablespace is system temporary. +@param[in] space_id tablespace ID to check +@return true if tablespace is system temporary. */ +inline +bool +fsp_is_system_temporary(ulint space_id) +{ + return(space_id == SRV_TMP_SPACE_ID); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */ + +/** Width of the POST_ANTELOPE flag */ +#define FSP_FLAGS_WIDTH_POST_ANTELOPE 1 +/** Number of flag bits used to indicate the tablespace zip page size */ +#define FSP_FLAGS_WIDTH_ZIP_SSIZE 4 +/** Width of the ATOMIC_BLOBS flag. The ability to break up a long +column into an in-record prefix and an externally stored part is available +to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. 
*/ +#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1 +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_WIDTH_PAGE_SSIZE 4 +/** Number of reserved bits */ +#define FSP_FLAGS_WIDTH_RESERVED 6 +/** Number of flag bits used to indicate the page compression */ +#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1 + +/** Width of all the currently known persistent tablespace flags */ +#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_RESERVED \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION) + +/** A mask of all the known/used bits in FSP_SPACE_FLAGS */ +#define FSP_FLAGS_MASK (~(~0U << FSP_FLAGS_WIDTH)) + +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE 4 + +/** Marker to indicate whether tablespace is in full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_MARKER 1 + +/** Stores the compressed algo for full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO 3 + +/* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older +and MariaDB 10.1.20 or older MariaDB 10.1 and in MariaDB 10.1.21 +or newer. 
+MySQL 5.6 MariaDB 10.1.x MariaDB 10.1.21 +==================================================================== +Below flags in same offset +==================================================================== +0: POST_ANTELOPE 0:POST_ANTELOPE 0: POST_ANTELOPE +1..4: ZIP_SSIZE(0..5) 1..4:ZIP_SSIZE(0..5) 1..4: ZIP_SSIZE(0..5) +(NOTE: bit 4 is always 0) +5: ATOMIC_BLOBS 5:ATOMIC_BLOBS 5: ATOMIC_BLOBS +===================================================================== +Below note the order difference: +===================================================================== +6..9: PAGE_SSIZE(3..7) 6: COMPRESSION 6..9: PAGE_SSIZE(3..7) +10: DATA_DIR 7..10: COMP_LEVEL(0..9) 10: RESERVED (5.6 DATA_DIR) +===================================================================== +The flags below were in incorrect position in MariaDB 10.1, +or have been introduced in MySQL 5.7 or 8.0: +===================================================================== +11: UNUSED 11..12:ATOMIC_WRITES 11: RESERVED (5.7 SHARED) + 12: RESERVED (5.7 TEMPORARY) + 13..15:PAGE_SSIZE(3..7) 13: RESERVED (5.7 ENCRYPTION) + 14: RESERVED (8.0 SDI) + 15: RESERVED + 16: PAGE_SSIZE_msb(0) 16: COMPRESSION + 17: DATA_DIR 17: UNUSED + 18: UNUSED +===================================================================== +The flags below only exist in fil_space_t::flags, not in FSP_SPACE_FLAGS: +===================================================================== + 27: DATA_DIR + 28..31: COMPRESSION_LEVEL +*/ + +/** A mask of the memory-only flags in fil_space_t::flags */ +#define FSP_FLAGS_MEM_MASK (~0U << FSP_FLAGS_MEM_DATA_DIR) + +/** Zero relative shift position of the DATA_DIR flag */ +#define FSP_FLAGS_MEM_DATA_DIR 27 +/** Zero relative shift position of the COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MEM_COMPRESSION_LEVEL 28 + +/** Zero relative shift position of the POST_ANTELOPE field */ +#define FSP_FLAGS_POS_POST_ANTELOPE 0 +/** Zero relative shift position of the ZIP_SSIZE field */ +#define 
FSP_FLAGS_POS_ZIP_SSIZE (FSP_FLAGS_POS_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_POST_ANTELOPE) +/** Zero relative shift position of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE) +/** Zero relative shift position of the start of the PAGE_SSIZE bits */ +#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the start of the RESERVED bits +these are only used in MySQL 5.7 and used for compatibility. */ +#define FSP_FLAGS_POS_RESERVED (FSP_FLAGS_POS_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_RESERVED \ + + FSP_FLAGS_WIDTH_RESERVED) + +/** Zero relative shift position of the PAGE_SIZE field +in full crc32 format */ +#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE 0 + +/** Zero relative shift position of the MARKER field in full crc32 format. */ +#define FSP_FLAGS_FCRC32_POS_MARKER (FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \ + + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE) + +/** Zero relative shift position of the compressed algorithm stored +in full crc32 format. 
*/ +#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO (FSP_FLAGS_FCRC32_POS_MARKER \ + + FSP_FLAGS_FCRC32_WIDTH_MARKER) + +/** Bit mask of the POST_ANTELOPE field */ +#define FSP_FLAGS_MASK_POST_ANTELOPE \ + ((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \ + << FSP_FLAGS_POS_POST_ANTELOPE) +/** Bit mask of the ZIP_SSIZE field */ +#define FSP_FLAGS_MASK_ZIP_SSIZE \ + ((~(~0U << FSP_FLAGS_WIDTH_ZIP_SSIZE)) \ + << FSP_FLAGS_POS_ZIP_SSIZE) +/** Bit mask of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_MASK_ATOMIC_BLOBS \ + ((~(~0U << FSP_FLAGS_WIDTH_ATOMIC_BLOBS)) \ + << FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Bit mask of the PAGE_SSIZE field */ +#define FSP_FLAGS_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_POS_PAGE_SSIZE) +/** Bit mask of the RESERVED field */ +#define FSP_FLAGS_MASK_RESERVED \ + ((~(~0U << FSP_FLAGS_WIDTH_RESERVED)) \ + << FSP_FLAGS_POS_RESERVED) +/** Bit mask of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION \ + ((~(~0U << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \ + << FSP_FLAGS_POS_PAGE_COMPRESSION) + +/** Bit mask of the in-memory COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL \ + (15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL) + +/** Bit mask of the PAGE_SIZE field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/** Bit mask of the MARKER field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_MARKER \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER)) \ + << FSP_FLAGS_FCRC32_POS_MARKER) + +/** Bit mask of the COMPRESSED ALGO field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO)) \ + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + +/** Return the value of the POST_ANTELOPE field */ +#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ + ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \ + >> FSP_FLAGS_POS_POST_ANTELOPE) +/** 
Return the value of the ZIP_SSIZE field */ +#define FSP_FLAGS_GET_ZIP_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_ZIP_SSIZE) \ + >> FSP_FLAGS_POS_ZIP_SSIZE) +/** Return the value of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS) \ + >> FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Return the value of the PAGE_SSIZE field */ +#define FSP_FLAGS_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_POS_PAGE_SSIZE) +/** @return the RESERVED flags */ +#define FSP_FLAGS_GET_RESERVED(flags) \ + ((flags & FSP_FLAGS_MASK_RESERVED) \ + >> FSP_FLAGS_POS_RESERVED) +/** @return the PAGE_COMPRESSION flag */ +#define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** @return the PAGE_SSIZE flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) +/** @return the COMPRESSED_ALGO flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO) \ + >> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + +/** @return the value of the DATA_DIR field */ +#define FSP_FLAGS_HAS_DATA_DIR(flags) \ + (flags & 1U << FSP_FLAGS_MEM_DATA_DIR) +/** @return the COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_MEM_COMPRESSION_LEVEL) + +/* @} */ + +struct fil_node_t; +struct fil_space_t; +class buf_page_t; diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h new file mode 100644 index 00000000..15bf30bc --- /dev/null +++ b/storage/innobase/include/fts0ast.h @@ -0,0 +1,340 @@ +/***************************************************************************** + +Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0ast.h +The FTS query parser (AST) abstract syntax tree routines + +Created 2007/03/16/03 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FST0AST_H +#define INNOBASE_FST0AST_H + +#include "mem0mem.h" + +/* The type of AST Node */ +enum fts_ast_type_t { + FTS_AST_OPER, /*!< Operator */ + FTS_AST_NUMB, /*!< Number */ + FTS_AST_TERM, /*!< Term (or word) */ + FTS_AST_TEXT, /*!< Text string */ + FTS_AST_PARSER_PHRASE_LIST, /*!< Phrase for plugin parser + The difference from text type + is that we tokenize text into + term list */ + FTS_AST_LIST, /*!< Expression list */ + FTS_AST_SUBEXP_LIST /*!< Sub-Expression list */ +}; + +/* The FTS query operators that we support */ +enum fts_ast_oper_t { + FTS_NONE, /*!< No operator */ + + FTS_IGNORE, /*!< Ignore rows that contain + this word */ + + FTS_EXIST, /*!< Include rows that contain + this word */ + + FTS_NEGATE, /*!< Include rows that contain + this word but rank them + lower*/ + + FTS_INCR_RATING, /*!< Increase the rank for this + word*/ + + FTS_DECR_RATING, /*!< Decrease the rank for this + word*/ + + FTS_DISTANCE, /*!< Proximity 
distance */ + FTS_IGNORE_SKIP, /*!< Transient node operator + signifies that this is a + FTS_IGNORE node, and ignored in + the first pass of + fts_ast_visit() */ + FTS_EXIST_SKIP /*!< Transient node operator + signifies that this is a + FTS_EXIST node, and ignored in + the first pass of + fts_ast_visit() */ +}; + +/* Data types used by the FTS parser */ +struct fts_lexer_t; +struct fts_ast_node_t; +struct fts_ast_state_t; +struct fts_ast_string_t; + +typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*); + +/******************************************************************** +Parse the string using the lexer setup within state.*/ +int +fts_parse( +/*======*/ + /* out: 0 on OK, 1 on error */ + fts_ast_state_t* state); /*!< in: ast state instance.*/ + +/******************************************************************** +Create an AST operator node */ +extern +fts_ast_node_t* +fts_ast_create_node_oper( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_oper_t oper); /*!< in: ast operator */ +/******************************************************************** +Create an AST term node, makes a copy of ptr */ +extern +fts_ast_node_t* +fts_ast_create_node_term( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: term string */ +/******************************************************************** +Create an AST text node */ +extern +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: text string */ +/******************************************************************** +Create an AST expr list node */ +extern +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_node_t* expr); /*!< in: ast expr */ +/******************************************************************** +Create a sub-expression list node. 
This function takes ownership of +expr and is responsible for deleting it. */ +extern +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + /* out: new node */ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr); /*!< in: ast expr instance */ +/******************************************************************** +Set the wildcard attribute of a term.*/ +extern +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node); /*!< in: term to change */ +/******************************************************************** +Set the proximity attribute of a text node. */ +void +fts_ast_text_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance); /*!< in: the text proximity + distance */ +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node); /*!< in: node to free */ +/******************************************************************** +Add a sub-expression to an AST*/ +extern +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* list, /*!< in: list node instance */ + fts_ast_node_t* node); /*!< in: (sub) expr to add */ +/******************************************************************** +Print the AST node recursively.*/ +extern +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node); /*!< in: ast node to print */ +/******************************************************************** +Free node and expr allocations.*/ +extern +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state); /*!< in: state instance + to free */ +/** Check only union operation involved in the node +@param[in] node ast node to check +@return true if the node contains only union else false. 
*/ +bool +fts_ast_node_check_union( + fts_ast_node_t* node); + +/******************************************************************//** +Traverse the AST - in-order traversal. +@return DB_SUCCESS if all went well */ +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: FTS operator */ + fts_ast_node_t* node, /*!< in: instance to traverse*/ + fts_ast_callback visitor, /*!< in: callback */ + void* arg, /*!< in: callback arg */ + bool* has_ignore) /*!< out: whether we encountered + and ignored processing an + operator; currently we only + ignore the FTS_IGNORE operator */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************** +Create a lex instance.*/ +fts_lexer_t* +fts_lexer_create( +/*=============*/ + ibool boolean_mode, /*!< in: query type */ + const byte* query, /*!< in: query string */ + ulint query_len) /*!< in: query string len */ + MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); +/******************************************************************** +Free an fts_lexer_t instance.*/ +void +fts_lexer_free( +/*===========*/ + fts_lexer_t* fts_lexer) /*!< in: lexer instance to + free */ + MY_ATTRIBUTE((nonnull)); + +/** +Create an ast string object, with NUL-terminator, so the string +has one more byte than len +@param[in] str pointer to string +@param[in] len length of the string +@return ast string with NUL-terminator */ +fts_ast_string_t* +fts_ast_string_create( + const byte* str, + ulint len); + +/** +Free an ast string instance +@param[in,out] ast_str string to free */ +void +fts_ast_string_free( + fts_ast_string_t* ast_str); + +/** +Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul +@param[in] ast_str string to translate +@param[in] base the base +@return translated number */ +ulint +fts_ast_string_to_ul( + const fts_ast_string_t* ast_str, + int base); + +/* String of length len. 
+We always store the string of length len with a terminating '\0', +regardless of whether there is any 0x00 in the string itself */ +struct fts_ast_string_t { + /*!< Pointer to string. */ + byte* str; + + /*!< Length of the string. */ + ulint len; +}; + +/* Query term type */ +struct fts_ast_term_t { + fts_ast_string_t* ptr; /*!< Pointer to term string.*/ + ibool wildcard; /*!< TRUE if wild card set.*/ +}; + +/* Query text type */ +struct fts_ast_text_t { + fts_ast_string_t* ptr; /*!< Pointer to text string.*/ + ulint distance; /*!< > 0 if proximity distance + set */ +}; + +/* The list of nodes in an expr list */ +struct fts_ast_list_t { + fts_ast_node_t* head; /*!< Children list head */ + fts_ast_node_t* tail; /*!< Children list tail */ +}; + +/* FTS AST node to store the term, text, operator and sub-expressions.*/ +struct fts_ast_node_t { + fts_ast_type_t type; /*!< The type of node */ + fts_ast_text_t text; /*!< Text node */ + fts_ast_term_t term; /*!< Term node */ + fts_ast_oper_t oper; /*!< Operator value */ + fts_ast_list_t list; /*!< Expression list */ + fts_ast_node_t* next; /*!< Link for expr list */ + fts_ast_node_t* next_alloc; /*!< For tracking allocations */ + bool visited; /*!< whether this node is + already processed */ + /** current transaction */ + const trx_t* trx; + /* Used by plugin parser */ + fts_ast_node_t* up_node; /*!< Direct up node */ + bool go_up; /*!< Flag if go one level up */ +}; + +/* To track state during parsing */ +struct fts_ast_state_t { + mem_heap_t* heap; /*!< Heap to use for alloc */ + fts_ast_node_t* root; /*!< If all goes OK, then this + will point to the root.*/ + + fts_ast_list_t list; /*!< List of nodes allocated */ + + fts_lexer_t* lexer; /*!< Lexer callback + arg */ + CHARSET_INFO* charset; /*!< charset used for + tokenization */ + /* Used by plugin parser */ + fts_ast_node_t* cur_node; /*!< Current node into which + we add new node */ + int depth; /*!< Depth of parsing state */ +}; + 
+/******************************************************************//** +Create an AST term node, makes a copy of ptr for plugin parser +@return node */ +extern +fts_ast_node_t* +fts_ast_create_node_term_for_parser( +/*================================*/ + void* arg, /*!< in: ast state */ + const char* ptr, /*!< in: term string */ + const ulint len); /*!< in: term string length */ + +/******************************************************************//** +Create an AST phrase list node for plugin parser +@return node */ +extern +fts_ast_node_t* +fts_ast_create_node_phrase_list( +/*============================*/ + void* arg); /*!< in: ast state */ + +#ifdef UNIV_DEBUG +const char* +fts_ast_node_type_get(fts_ast_type_t type); +#endif /* UNIV_DEBUG */ + +#endif /* INNOBASE_FST0AST_H */ diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h new file mode 100644 index 00000000..b16e7f2c --- /dev/null +++ b/storage/innobase/include/fts0blex.h @@ -0,0 +1,702 @@ +#ifndef fts0bHEADER_H +#define fts0bHEADER_H 1 +#define fts0bIN_HEADER 1 + +#line 6 "../include/fts0blex.h" + +#line 8 "../include/fts0blex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0b_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0b_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0b_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0b_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0b_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0b_scan_buffer +#endif + +#ifdef yy_scan_string +#define fts0b_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0b_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0b_scan_bytes_ALREADY_DEFINED 
+#else +#define yy_scan_bytes fts0b_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0b_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0b_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0b_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0b_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0b_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0b_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0b_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0b_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0bpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0bpush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define fts0bpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0bpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0bensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0bensure_buffer_stack +#endif + +#ifdef yylex +#define fts0blex_ALREADY_DEFINED +#else +#define yylex fts0blex +#endif + +#ifdef yyrestart +#define fts0brestart_ALREADY_DEFINED +#else +#define yyrestart fts0brestart +#endif + +#ifdef yylex_init +#define fts0blex_init_ALREADY_DEFINED +#else +#define yylex_init fts0blex_init +#endif + +#ifdef yylex_init_extra +#define fts0blex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0blex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0blex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy fts0blex_destroy +#endif + +#ifdef yyget_debug +#define fts0bget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0bget_debug +#endif + +#ifdef yyset_debug +#define fts0bset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0bset_debug +#endif + +#ifdef yyget_extra +#define fts0bget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0bget_extra +#endif + +#ifdef yyset_extra +#define fts0bset_extra_ALREADY_DEFINED 
+#else +#define yyset_extra fts0bset_extra +#endif + +#ifdef yyget_in +#define fts0bget_in_ALREADY_DEFINED +#else +#define yyget_in fts0bget_in +#endif + +#ifdef yyset_in +#define fts0bset_in_ALREADY_DEFINED +#else +#define yyset_in fts0bset_in +#endif + +#ifdef yyget_out +#define fts0bget_out_ALREADY_DEFINED +#else +#define yyget_out fts0bget_out +#endif + +#ifdef yyset_out +#define fts0bset_out_ALREADY_DEFINED +#else +#define yyset_out fts0bset_out +#endif + +#ifdef yyget_leng +#define fts0bget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0bget_leng +#endif + +#ifdef yyget_text +#define fts0bget_text_ALREADY_DEFINED +#else +#define yyget_text fts0bget_text +#endif + +#ifdef yyget_lineno +#define fts0bget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno fts0bget_lineno +#endif + +#ifdef yyset_lineno +#define fts0bset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0bset_lineno +#endif + +#ifdef yyget_column +#define fts0bget_column_ALREADY_DEFINED +#else +#define yyget_column fts0bget_column +#endif + +#ifdef yyset_column +#define fts0bset_column_ALREADY_DEFINED +#else +#define yyset_column fts0bset_column +#endif + +#ifdef yywrap +#define fts0bwrap_ALREADY_DEFINED +#else +#define yywrap fts0bwrap +#endif + +#ifdef yyalloc +#define fts0balloc_ALREADY_DEFINED +#else +#define yyalloc fts0balloc +#endif + +#ifdef yyrealloc +#define fts0brealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0brealloc +#endif + +#ifdef yyfree +#define fts0bfree_ALREADY_DEFINED +#else +#define yyfree fts0bfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. 
*/ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. 
*/ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. 
*/ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0bwrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. 
*/ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. 
+ */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#ifndef fts0b_create_buffer_ALREADY_DEFINED +#undef yy_create_buffer +#endif +#ifndef fts0b_delete_buffer_ALREADY_DEFINED +#undef yy_delete_buffer +#endif +#ifndef fts0b_scan_buffer_ALREADY_DEFINED +#undef yy_scan_buffer +#endif +#ifndef fts0b_scan_string_ALREADY_DEFINED +#undef yy_scan_string +#endif +#ifndef fts0b_scan_bytes_ALREADY_DEFINED +#undef yy_scan_bytes +#endif +#ifndef fts0b_init_buffer_ALREADY_DEFINED +#undef yy_init_buffer +#endif +#ifndef fts0b_flush_buffer_ALREADY_DEFINED +#undef yy_flush_buffer +#endif +#ifndef fts0b_load_buffer_state_ALREADY_DEFINED +#undef yy_load_buffer_state +#endif +#ifndef fts0b_switch_to_buffer_ALREADY_DEFINED +#undef yy_switch_to_buffer +#endif +#ifndef fts0bpush_buffer_state_ALREADY_DEFINED +#undef yypush_buffer_state +#endif +#ifndef fts0bpop_buffer_state_ALREADY_DEFINED +#undef yypop_buffer_state +#endif +#ifndef fts0bensure_buffer_stack_ALREADY_DEFINED +#undef yyensure_buffer_stack +#endif +#ifndef fts0blex_ALREADY_DEFINED +#undef yylex +#endif +#ifndef fts0brestart_ALREADY_DEFINED +#undef yyrestart +#endif +#ifndef fts0blex_init_ALREADY_DEFINED +#undef yylex_init +#endif +#ifndef fts0blex_init_extra_ALREADY_DEFINED +#undef yylex_init_extra +#endif +#ifndef fts0blex_destroy_ALREADY_DEFINED +#undef yylex_destroy +#endif +#ifndef fts0bget_debug_ALREADY_DEFINED +#undef yyget_debug +#endif +#ifndef fts0bset_debug_ALREADY_DEFINED +#undef yyset_debug +#endif +#ifndef fts0bget_extra_ALREADY_DEFINED +#undef yyget_extra +#endif +#ifndef fts0bset_extra_ALREADY_DEFINED 
+#undef yyset_extra +#endif +#ifndef fts0bget_in_ALREADY_DEFINED +#undef yyget_in +#endif +#ifndef fts0bset_in_ALREADY_DEFINED +#undef yyset_in +#endif +#ifndef fts0bget_out_ALREADY_DEFINED +#undef yyget_out +#endif +#ifndef fts0bset_out_ALREADY_DEFINED +#undef yyset_out +#endif +#ifndef fts0bget_leng_ALREADY_DEFINED +#undef yyget_leng +#endif +#ifndef fts0bget_text_ALREADY_DEFINED +#undef yyget_text +#endif +#ifndef fts0bget_lineno_ALREADY_DEFINED +#undef yyget_lineno +#endif +#ifndef fts0bset_lineno_ALREADY_DEFINED +#undef yyset_lineno +#endif +#ifndef fts0bget_column_ALREADY_DEFINED +#undef yyget_column +#endif +#ifndef fts0bset_column_ALREADY_DEFINED +#undef yyset_column +#endif +#ifndef fts0bwrap_ALREADY_DEFINED +#undef yywrap +#endif +#ifndef fts0bget_lval_ALREADY_DEFINED +#undef yyget_lval +#endif +#ifndef fts0bset_lval_ALREADY_DEFINED +#undef yyset_lval +#endif +#ifndef fts0bget_lloc_ALREADY_DEFINED +#undef yyget_lloc +#endif +#ifndef fts0bset_lloc_ALREADY_DEFINED +#undef yyset_lloc +#endif +#ifndef fts0balloc_ALREADY_DEFINED +#undef yyalloc +#endif +#ifndef fts0brealloc_ALREADY_DEFINED +#undef yyrealloc +#endif +#ifndef fts0bfree_ALREADY_DEFINED +#undef yyfree +#endif +#ifndef fts0btext_ALREADY_DEFINED +#undef yytext +#endif +#ifndef fts0bleng_ALREADY_DEFINED +#undef yyleng +#endif +#ifndef fts0bin_ALREADY_DEFINED +#undef yyin +#endif +#ifndef fts0bout_ALREADY_DEFINED +#undef yyout +#endif +#ifndef fts0b_flex_debug_ALREADY_DEFINED +#undef yy_flex_debug +#endif +#ifndef fts0blineno_ALREADY_DEFINED +#undef yylineno +#endif +#ifndef fts0btables_fload_ALREADY_DEFINED +#undef yytables_fload +#endif +#ifndef fts0btables_destroy_ALREADY_DEFINED +#undef yytables_destroy +#endif +#ifndef fts0bTABLES_NAME_ALREADY_DEFINED +#undef yyTABLES_NAME +#endif + +#line 74 "fts0blex.l" + + +#line 701 "../include/fts0blex.h" +#undef fts0bIN_HEADER +#endif /* fts0bHEADER_H */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h new file mode 
100644 index 00000000..906ece2e --- /dev/null +++ b/storage/innobase/include/fts0fts.h @@ -0,0 +1,976 @@ +/***************************************************************************** + +Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0fts.h +Full text search header file + +Created 2011/09/02 Sunny Bains +***********************************************************************/ + +#pragma once + +#include "data0type.h" +#include "data0types.h" +#include "mem0mem.h" +#include "rem0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" +#include "ut0rbt.h" +#include "ut0wqueue.h" +#include "que0types.h" +#include "ft_global.h" +#include "mysql/plugin_ftparser.h" + +/** "NULL" value of a document id. 
*/ +#define FTS_NULL_DOC_ID 0 + +/** FTS hidden column that is used to map to and from the row */ +#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID" + +/** The name of the index created by FTS */ +#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX" + +#define FTS_DOC_ID_INDEX_NAME_LEN 16 + +/** Doc ID is an 8-byte value */ +#define FTS_DOC_ID_LEN 8 + +/** The number of fields to sort when we build FT index with +FIC. Three fields are sorted: (word, doc_id, position) */ +#define FTS_NUM_FIELDS_SORT 3 + +/** Maximum number of rows in a table, smaller than which, we will +optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */ +#define MAX_DOC_ID_OPT_VAL 1073741824 + +/** Document id type. */ +typedef ib_id_t doc_id_t; + +/** doc_id_t printf format */ +#define FTS_DOC_ID_FORMAT IB_ID_FMT + +/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */ +#define fts_write_doc_id(d, s) mach_write_to_8(d, s) + +/** Read a document id to internal format. */ +#define fts_read_doc_id(s) mach_read_from_8(s) + +/** Bind the doc id to a variable */ +#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v) + +/** Defines for FTS query mode, they have the same values as +those defined in mysql file ft_global.h */ +#define FTS_NL 0 +#define FTS_BOOL 1 +#define FTS_SORTED 2 +#define FTS_EXPAND 4 +#define FTS_NO_RANKING 8 +#define FTS_PROXIMITY 16 +#define FTS_PHRASE 32 +#define FTS_OPT_RANKING 64 + +#define FTS_INDEX_TABLE_IND_NAME "FTS_INDEX_TABLE_IND" + +/** The number of FTS index partitions for a fulltext index */ +#define FTS_NUM_AUX_INDEX 6 + +/** Threshold where our optimize thread automatically kicks in */ +#define FTS_OPTIMIZE_THRESHOLD 10000000 + +/** Threshold to avoid exhaustion of doc ids. 
Consecutive doc id difference +should not exceed FTS_DOC_ID_MAX_STEP */ +#define FTS_DOC_ID_MAX_STEP 65535 + +/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */ +#define FTS_MAX_WORD_LEN (HA_FT_MAXCHARLEN * 4) + +/** Maximum possible Fulltext word length (in characters) */ +#define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN + +/** Number of columns in FTS AUX Tables */ +#define FTS_DELETED_TABLE_NUM_COLS 1 +#define FTS_CONFIG_TABLE_NUM_COLS 2 +#define FTS_AUX_INDEX_TABLE_NUM_COLS 5 + +/** DELETED_TABLE(doc_id BIGINT UNSIGNED) */ +#define FTS_DELETED_TABLE_COL_LEN 8 +/** CONFIG_TABLE(key CHAR(50), value CHAR(200)) */ +#define FTS_CONFIG_TABLE_KEY_COL_LEN 50 +#define FTS_CONFIG_TABLE_VALUE_COL_LEN 200 + +#define FTS_INDEX_FIRST_DOC_ID_LEN 8 +#define FTS_INDEX_LAST_DOC_ID_LEN 8 +#define FTS_INDEX_DOC_COUNT_LEN 4 +/* BLOB COLUMN, 0 means VARIABLE SIZE */ +#define FTS_INDEX_ILIST_LEN 0 + + +/** Variable specifying the FTS parallel sort degree */ +extern ulong fts_sort_pll_degree; + +/** Variable specifying the number of word to optimize for each optimize table +call */ +extern ulong fts_num_word_optimize; + +/** Variable specifying whether we do additional FTS diagnostic printout +in the log */ +extern char fts_enable_diag_print; + +/** FTS rank type, which will be between 0 .. 1 inclusive */ +typedef float fts_rank_t; + +/** Type of a row during a transaction. FTS_NOTHING means the row can be +forgotten from the FTS system's POV, FTS_INVALID is an internal value used +to mark invalid states. + +NOTE: Do not change the order or value of these, fts_trx_row_get_new_state +depends on them being exactly as they are. */ +enum fts_row_state { + FTS_INSERT = 0, + FTS_MODIFY, + FTS_DELETE, + FTS_NOTHING, + FTS_INVALID +}; + +/** The FTS table types. 
*/ +enum fts_table_type_t { + FTS_INDEX_TABLE, /*!< FTS auxiliary table that is + specific to a particular FTS index + on a table */ + + FTS_COMMON_TABLE /*!< FTS auxiliary table that is common + for all FTS index on a table */ +}; + +struct fts_doc_t; +struct fts_cache_t; +struct fts_token_t; +struct fts_doc_ids_t; +struct fts_index_cache_t; + + +/** Initialize the "fts_table" for internal query into FTS auxiliary +tables */ +#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_table->id; \ + (fts_table)->table = m_table; \ +} while (0); + +#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_index->table->id; \ + (fts_table)->table = m_index->table; \ + (fts_table)->index_id = m_index->id; \ +} while (0); + +/** Information about changes in a single transaction affecting +the FTS system. */ +struct fts_trx_t { + trx_t* trx; /*!< InnoDB transaction */ + + ib_vector_t* savepoints; /*!< Active savepoints, must have at + least one element, the implied + savepoint */ + ib_vector_t* last_stmt; /*!< last_stmt */ + + mem_heap_t* heap; /*!< heap */ +}; + +/** Information required for transaction savepoint handling. */ +struct fts_savepoint_t { + char* name; /*!< First entry is always NULL, the + default instance. Otherwise the name + of the savepoint */ + + ib_rbt_t* tables; /*!< Modified FTS tables */ +}; + +/** Information about changed rows in a transaction for a single table. 
*/ +struct fts_trx_table_t { + dict_table_t* table; /*!< table */ + + fts_trx_t* fts_trx; /*!< link to parent */ + + ib_rbt_t* rows; /*!< rows changed; indexed by doc-id, + cells are fts_trx_row_t* */ + + fts_doc_ids_t* added_doc_ids; /*!< list of added doc ids (NULL until + the first addition) */ + + /*!< for adding doc ids */ + que_t* docs_added_graph; +}; + +/** Information about one changed row in a transaction. */ +struct fts_trx_row_t { + doc_id_t doc_id; /*!< Id of the ins/upd/del document */ + + fts_row_state state; /*!< state of the row */ + + ib_vector_t* fts_indexes; /*!< The indexes that are affected */ +}; + +/** List of document ids that were added during a transaction. This +list is passed on to a background 'Add' thread and OPTIMIZE, so it +needs its own memory heap. */ +struct fts_doc_ids_t { + ib_vector_t* doc_ids; /*!< document ids (each element is + of type doc_id_t). */ + + ib_alloc_t* self_heap; /*!< Allocator used to create an + instance of this type and the + doc_ids vector */ +}; + +// FIXME: Get rid of this if possible. +/** Since MySQL's character set support for Unicode is woefully inadequate +(it supports basic operations like isalpha etc. only for 8-bit characters), +we have to implement our own. We use UTF-16 without surrogate processing +as our in-memory format. This typedef is a single such character. */ +typedef unsigned short ib_uc_t; + +/** A UTF-16 or UTF-8 string. */ +struct fts_string_t { + byte* f_str; /*!< string, not necessarily terminated in + any way */ + ulint f_len; /*!< Length of the string in bytes */ + ulint f_n_char; /*!< Number of characters */ +}; + +/** Query ranked doc ids. */ +struct fts_ranking_t { + doc_id_t doc_id; /*!< Document id */ + + fts_rank_t rank; /*!< Rank is between 0 .. 1 */ + + byte* words; /*!< this contains the words + that were queried + and found in this document */ + ulint words_len; /*!< words len */ +}; + +/** Query result. 
*/ +struct fts_result_t { + ib_rbt_node_t* current; /*!< Current element */ + + ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t + indexed by doc id */ + ib_rbt_t* rankings_by_rank;/*!< RB tree of type fts_ranking_t + indexed by rank */ +}; + +/** This is used to generate the FTS auxiliary table name, we need the +table id and the index id to generate the column specific FTS auxiliary +table name. */ +struct fts_table_t { + fts_table_type_t + type; /*!< The auxiliary table type */ + + table_id_t table_id; /*!< The table id */ + + index_id_t index_id; /*!< The index id */ + + const char* suffix; /*!< The suffix of the fts auxiliary + table name, can be NULL, not used + everywhere (yet) */ + const dict_table_t* + table; /*!< Parent table */ + CHARSET_INFO* charset; /*!< charset info if it is for FTS + index auxiliary table */ +}; + +/** The state of the FTS sub system. */ +class fts_t { +public: + /** fts_t constructor. + @param[in] table table with FTS indexes + @param[in,out] heap memory heap where 'this' is stored */ + fts_t( + const dict_table_t* table, + mem_heap_t* heap); + + /** fts_t destructor. */ + ~fts_t(); + + /** Whether the ADDED table record sync-ed after crash recovery */ + unsigned added_synced:1; + /** Whether the table holds dict_sys.mutex */ + unsigned dict_locked:1; + + /** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL + if the thread has not yet been created. Each work item is a + fts_trx_doc_ids_t*. */ + ib_wqueue_t* add_wq; + + /** FTS memory buffer for this table, or NULL if the table has no FTS + index. */ + fts_cache_t* cache; + + /** FTS doc id hidden column number in the CLUSTERED index. */ + ulint doc_col; + + /** Vector of FTS indexes, this is mainly for caching purposes. 
*/ + ib_vector_t* indexes; + + /** Whether the table exists in fts_optimize_wq; + protected by fts_optimize_wq mutex */ + bool in_queue; + + /** Whether the sync message exists in fts_optimize_wq; + protected by fts_optimize_wq mutex */ + bool sync_message; + + /** Heap for fts_t allocation. */ + mem_heap_t* fts_heap; +}; + +struct fts_stopword_t; + +/** status bits for fts_stopword_t status field. */ +#define STOPWORD_NOT_INIT 0x1 +#define STOPWORD_OFF 0x2 +#define STOPWORD_FROM_DEFAULT 0x4 +#define STOPWORD_USER_TABLE 0x8 + +extern const char* fts_default_stopword[]; + +/** Variable specifying the maximum FTS cache size for each table */ +extern ulong fts_max_cache_size; + +/** Variable specifying the total memory allocated for FTS cache */ +extern ulong fts_max_total_cache_size; + +/** Variable specifying the FTS result cache limit for each query */ +extern size_t fts_result_cache_limit; + +/** Variable specifying the maximum FTS token size */ +extern ulong fts_max_token_size; + +/** Variable specifying the minimum FTS token size */ +extern ulong fts_min_token_size; + +/** Whether the total memory used for FTS cache is exhausted, and we will +need a sync to free some memory */ +extern bool fts_need_sync; +/** Free a query graph with que_graph_free() while holding dict_sys.mutex (acquired and released here). */ +#define fts_que_graph_free(graph) \ +do { \ + mutex_enter(&dict_sys.mutex); \ + que_graph_free(graph); \ + mutex_exit(&dict_sys.mutex); \ +} while (0) + +/******************************************************************//** +Create a FTS cache. */ +fts_cache_t* +fts_cache_create( +/*=============*/ + dict_table_t* table); /*!< table owns the FTS cache */ + +/******************************************************************//** +Create a FTS index cache. 
+@return Index Cache */ +fts_index_cache_t* +fts_cache_index_cache_create( +/*=========================*/ + dict_table_t* table, /*!< in: table with FTS index */ + dict_index_t* index); /*!< in: FTS index */ + +/******************************************************************//** +Get the next available document id. This function creates a new +transaction to generate the document id. +@return DB_SUCCESS if OK */ +dberr_t +fts_get_next_doc_id( +/*================*/ + const dict_table_t* table, /*!< in: table */ + doc_id_t* doc_id);/*!< out: new document id */ +/*********************************************************************//** +Update the next and last Doc ID in the CONFIG table to be the input +"doc_id" value (+ 1). We would do so after each FTS index build or +table truncate */ +void +fts_update_next_doc_id( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + const dict_table_t* table, /*!< in: table */ + doc_id_t doc_id) /*!< in: DOC ID to set */ + MY_ATTRIBUTE((nonnull(2))); + +/******************************************************************//** +Create a new fts_doc_ids_t. +@return new fts_doc_ids_t. */ +fts_doc_ids_t* +fts_doc_ids_create(void); +/*=====================*/ + +/** Free fts_doc_ids_t */ +inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids) +{ + mem_heap_free(static_cast<mem_heap_t*>(doc_ids->self_heap->arg)); +} + +/******************************************************************//** +Notify the FTS system about an operation on an FTS-indexed table. */ +void +fts_trx_add_op( +/*===========*/ + trx_t* trx, /*!< in: InnoDB transaction */ + dict_table_t* table, /*!< in: table */ + doc_id_t doc_id, /*!< in: doc id */ + fts_row_state state, /*!< in: state of the row */ + ib_vector_t* fts_indexes); /*!< in: FTS indexes affected + (NULL=all) */ + +/******************************************************************//** +Free an FTS trx. 
*/ +void +fts_trx_free( +/*=========*/ + fts_trx_t* fts_trx); /*!< in, own: FTS trx */ + +/** Creates the common auxiliary tables needed for supporting an FTS index +on the given table. row_mysql_lock_data_dictionary must have been called +before this. +The following tables are created. +CREATE TABLE $FTS_PREFIX_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_CONFIG + (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key) +@param[in,out] trx transaction +@param[in] table table with FTS index +@param[in] skip_doc_id_index Skip index on doc id +@return DB_SUCCESS if successful */ +dberr_t +fts_create_common_tables( + trx_t* trx, + dict_table_t* table, + bool skip_doc_id_index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Creates the column specific ancillary tables needed for supporting an +FTS index on the given table. row_mysql_lock_data_dictionary must have +been called before this. + +All FTS AUX Index tables have the following schema. +CREATE TABLE $FTS_PREFIX_INDEX_[1-6]( + word VARCHAR(FTS_MAX_WORD_LEN), + first_doc_id INT NOT NULL, + last_doc_id UNSIGNED NOT NULL, + doc_count UNSIGNED INT NOT NULL, + ilist VARBINARY NOT NULL, + UNIQUE CLUSTERED INDEX ON (word, first_doc_id)) +@param[in,out] trx dictionary transaction +@param[in] index fulltext index +@param[in] id table id +@return DB_SUCCESS or error code */ +dberr_t +fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Add the FTS document id hidden column. 
*/ +void +fts_add_doc_id_column( +/*==================*/ + dict_table_t* table, /*!< in/out: Table with FTS index */ + mem_heap_t* heap); /*!< in: temporary memory heap, or NULL */ + +/*********************************************************************//** +Drops the ancillary tables needed for supporting an FTS index on the +given table. row_mysql_lock_data_dictionary must have been called before +this. +@return DB_SUCCESS or error code */ +dberr_t +fts_drop_tables( +/*============*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table); /*!< in: table has the FTS + index */ +/******************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. +@return DB_SUCCESS or error code */ +dberr_t +fts_commit( +/*=======*/ + trx_t* trx) /*!< in: transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/** FTS Query entry point. +@param[in,out] trx transaction +@param[in] index fts index to search +@param[in] flags FTS search mode +@param[in] query_str FTS query +@param[in] query_len FTS query string len in bytes +@param[in,out] result result doc ids +@return DB_SUCCESS if successful otherwise error code */ +dberr_t +fts_query( + trx_t* trx, + dict_index_t* index, + uint flags, + const byte* query_str, + ulint query_len, + fts_result_t** result) + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +@return the relevance ranking value. */ +float +fts_retrieve_ranking( +/*=================*/ + fts_result_t* result, /*!< in: FTS result structure */ + doc_id_t doc_id); /*!< in: the interested document + doc_id */ + +/******************************************************************//** +FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. 
*/ +void +fts_query_sort_result_on_rank( +/*==========================*/ + fts_result_t* result); /*!< out: result instance + to sort.*/ + +/******************************************************************//** +FTS Query free result, returned by fts_query(). */ +void +fts_query_free_result( +/*==================*/ + fts_result_t* result); /*!< in: result instance + to free.*/ + +/******************************************************************//** +Extract the doc id from the FTS hidden column. */ +doc_id_t +fts_get_doc_id_from_row( +/*====================*/ + dict_table_t* table, /*!< in: table */ + dtuple_t* row); /*!< in: row whose FTS doc id we + want to extract.*/ + +/** Extract the doc id from the record that belongs to index. +@param[in] rec record containing FTS_DOC_ID +@param[in] index index of rec +@param[in] offsets rec_get_offsets(rec,index) +@return doc id that was extracted from rec */ +doc_id_t +fts_get_doc_id_from_rec( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets); + +/** Add new fts doc id to the update vector. +@param[in] table the table that contains the FTS index. +@param[in,out] ufield the fts doc id field in the update vector. + No new memory is allocated for this in this + function. +@param[in,out] next_doc_id the fts doc id that has been added to the + update vector. If 0, a new fts doc id is + automatically generated. The memory provided + for this argument will be used by the update + vector. Ensure that the life time of this + memory matches that of the update vector. +@return the fts doc id used in the update vector */ +doc_id_t +fts_update_doc_id( + dict_table_t* table, + upd_field_t* ufield, + doc_id_t* next_doc_id); + +/******************************************************************//** +FTS initialize. */ +void +fts_startup(void); +/*==============*/ + +/******************************************************************//** +Create an instance of fts_t. 
+@return instance of fts_t */ +fts_t* +fts_create( +/*=======*/ + dict_table_t* table); /*!< out: table with FTS + indexes */ + +/**********************************************************************//** +Free the FTS resources. */ +void +fts_free( +/*=====*/ + dict_table_t* table); /*!< in/out: table with + FTS indexes */ + +/*********************************************************************//** +Run OPTIMIZE on the given table. +@return DB_SUCCESS if all OK */ +dberr_t +fts_optimize_table( +/*===============*/ + dict_table_t* table); /*!< in: table to optimize */ + +/**********************************************************************//** +Startup the optimize thread and create the work queue. */ +void +fts_optimize_init(void); +/*====================*/ + +/****************************************************************//** +Drops index ancillary tables for a FTS index +@return DB_SUCCESS or error code */ +dberr_t +fts_drop_index_tables( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index) /*!< in: Index to drop */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Add the table to the OPTIMIZER's list. +@param[in] table table to add */ +void +fts_optimize_add_table( + dict_table_t* table); + +/******************************************************************//** +Remove the table from the OPTIMIZER's list. We do wait for +acknowledgement from the consumer of the message. */ +void +fts_optimize_remove_table( +/*======================*/ + dict_table_t* table); /*!< in: table to remove */ + +/** Shutdown fts optimize thread. */ +void +fts_optimize_shutdown(); + +/** Send sync fts cache for the table. +@param[in] table table to sync */ +void +fts_optimize_request_sync_table( + dict_table_t* table); + +/**********************************************************************//** +Take a FTS savepoint. 
*/ +void +fts_savepoint_take( +/*===============*/ + fts_trx_t* fts_trx, /*!< in: fts transaction */ + const char* name); /*!< in: savepoint name */ + +/**********************************************************************//** +Refresh last statement savepoint. */ +void +fts_savepoint_laststmt_refresh( +/*===========================*/ + trx_t* trx); /*!< in: transaction */ + +/**********************************************************************//** +Release the savepoint data identified by name. */ +void +fts_savepoint_release( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + const char* name); /*!< in: savepoint name */ + +/** Clear cache. +@param[in,out] cache fts cache */ +void +fts_cache_clear( + fts_cache_t* cache); + +/*********************************************************************//** +Initialize things in cache. */ +void +fts_cache_init( +/*===========*/ + fts_cache_t* cache); /*!< in: cache */ + +/*********************************************************************//** +Rollback to and including savepoint identified by name. */ +void +fts_savepoint_rollback( +/*===================*/ + trx_t* trx, /*!< in: transaction */ + const char* name); /*!< in: savepoint name */ + +/*********************************************************************//** +Rollback for the last statement; takes only the transaction, no savepoint name. NOTE(review): confirm exact semantics against the definition. */ +void +fts_savepoint_rollback_last_stmt( +/*=============================*/ + trx_t* trx); /*!< in: transaction */ + +/** Drop all orphaned FTS auxiliary tables, those that don't have a parent +table or FTS index defined on them. */ +void fts_drop_orphaned_tables(); + +/** Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. +@param[in,out] table fts table +@param[in] wait whether to wait for existing sync to finish +@return DB_SUCCESS on success, error code on failure. 
*/ +dberr_t fts_sync_table(dict_table_t* table, bool wait = true); + +/****************************************************************//** +Free the query graph but check whether dict_sys.mutex is already +held */ +void +fts_que_graph_free_check_lock( +/*==========================*/ + fts_table_t* fts_table, /*!< in: FTS table */ + const fts_index_cache_t*index_cache, /*!< in: FTS index cache */ + que_t* graph); /*!< in: query graph */ + +/****************************************************************//** +Get the character set (CHARSET_INFO) of an FTS index. */ +CHARSET_INFO* +fts_index_get_charset( +/*==================*/ + dict_index_t* index); /*!< in: FTS index */ + +/*********************************************************************//** +Get the initial Doc ID by consulting the CONFIG table +@return initial Doc ID */ +doc_id_t +fts_init_doc_id( +/*============*/ + const dict_table_t* table); /*!< in: table */ + +/******************************************************************//** +Compare two character strings according to their charset. */ +extern +int +innobase_fts_text_cmp( +/*==================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Makes all characters in a string lower case. */ +extern +size_t +innobase_fts_casedn_str( +/*====================*/ + CHARSET_INFO* cs, /*!< in: Character set */ + char* src, /*!< in: string to put in + lower case */ + size_t src_len, /*!< in: input string length */ + char* dst, /*!< out: buffer for result + string */ + size_t dst_len); /*!< in: buffer size */ + + +/******************************************************************//** +Compare two character strings according to their charset. 
*/ +extern +int +innobase_fts_text_cmp_prefix( +/*=========================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/*************************************************************//** +Get the next token from the given string and store it in *token. */ +extern +ulint +innobase_mysql_fts_get_token( +/*=========================*/ + CHARSET_INFO* charset, /*!< in: Character set */ + const byte* start, /*!< in: start of text */ + const byte* end, /*!< in: one character past + end of text */ + fts_string_t* token); /*!< out: token's text */ + +/*************************************************************//** +Get token char size by charset +@return the number of token char size */ +ulint +fts_get_token_size( +/*===============*/ + const CHARSET_INFO* cs, /*!< in: Character set */ + const char* token, /*!< in: token */ + ulint len); /*!< in: token length */ + +/*************************************************************//** +FULLTEXT tokenizer internal in MYSQL_FTPARSER_SIMPLE_MODE +@return 0 if tokenized successfully */ +int +fts_tokenize_document_internal( +/*===========================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser parameter */ + const char* doc, /*!< in: document to tokenize */ + int len); /*!< in: document length */ + +/*********************************************************************//** +Fetch COUNT(*) from specified table. 
+@return the number of rows in the table */ +ulint +fts_get_rows_count( +/*===============*/ + fts_table_t* fts_table); /*!< in: fts table to read */ + +/*************************************************************//** +Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists +@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */ +doc_id_t +fts_get_max_doc_id( +/*===============*/ + dict_table_t* table); /*!< in: user table */ + +/******************************************************************//** +Check whether user supplied stopword table exists and is of +the right format. +@return the stopword column charset if qualifies */ +CHARSET_INFO* +fts_valid_stopword_table( +/*=====================*/ + const char* stopword_table_name); /*!< in: Stopword table + name */ +/****************************************************************//** +This function loads specified stopword into FTS cache +@return true if success */ +bool +fts_load_stopword( +/*==============*/ + const dict_table_t* + table, /*!< in: Table with FTS */ + trx_t* trx, /*!< in: Transaction */ + const char* session_stopword_table, /*!< in: Session stopword table + name */ + bool stopword_is_on, /*!< in: Whether stopword + option is turned on/off */ + bool reload); /*!< in: Whether it is during + reload of FTS table */ + +/****************************************************************//** +Read the rows from the FTS index +@return DB_SUCCESS if OK */ +dberr_t +fts_table_fetch_doc_ids( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: aux table */ + fts_doc_ids_t* doc_ids); /*!< in: For collecting + doc ids */ +/****************************************************************//** +This function brings FTS index in sync when FTS index is first +used. 
There are documents that have not yet been synced to auxiliary +tables since the last abnormal server shutdown; we will need to bring +such documents into the FTS cache before any further operations +@return TRUE if all OK */ +ibool +fts_init_index( +/*===========*/ + dict_table_t* table, /*!< in: Table with FTS */ + ibool has_cache_lock); /*!< in: Whether we already + have cache lock */ +/*******************************************************************//** +Add a newly created index in FTS cache */ +void +fts_add_index( +/*==========*/ + dict_index_t* index, /*!< FTS index to be added */ + dict_table_t* table); /*!< table */ + +/*******************************************************************//** +Drop auxiliary tables related to an FTS index +@return DB_SUCCESS or error number */ +dberr_t +fts_drop_index( +/*===========*/ + dict_table_t* table, /*!< in: Table where indexes are dropped */ + dict_index_t* index, /*!< in: Index to be dropped */ + trx_t* trx); /*!< in: Transaction for the drop */ + +/****************************************************************//** +Rename auxiliary tables for all fts indexes of a table +@return DB_SUCCESS or error code */ +dberr_t +fts_rename_aux_tables( +/*==================*/ + dict_table_t* table, /*!< in: user Table */ + const char* new_name, /*!< in: new table name */ + trx_t* trx); /*!< in: transaction */ + +/*******************************************************************//** +Check that the indexes in fts->indexes are also present in the index cache and +the table->indexes list +@return TRUE if all indexes match */ +ibool +fts_check_cached_index( +/*===================*/ + dict_table_t* table); /*!< in: Table whose cached indexes are checked */ + +/** Fetch the document from tuple, tokenize the text data and +insert the text data into fts auxiliary table and +its cache. Moreover, the fields of this tuple do not contain any information +about externally stored fields. This tuple contains data directly +converted from mysql. 
+@param[in] ftt FTS transaction table +@param[in] doc_id doc id +@param[in] tuple tuple from where data can be retrieved + and tuple should be arranged in table + schema order. */ +void +fts_add_doc_from_tuple( + fts_trx_table_t*ftt, + doc_id_t doc_id, + const dtuple_t* tuple); + +/** Create an FTS trx. +@param[in,out] trx InnoDB Transaction +@return FTS transaction. */ +fts_trx_t* +fts_trx_create( + trx_t* trx); + +/** Clear all fts resources when there is no internal DOC_ID +and there are no new fts indexes to add. +@param[in,out] table table where fts is to be freed +@param[in] trx transaction to drop all fts tables */ +void fts_clear_all(dict_table_t *table, trx_t *trx); + +/** Check whether the given name is an fts auxiliary table +and fetch the parent table id and index id +@param[in] name table name +@param[in,out] table_id parent table id +@param[in,out] index_id index id +@return true if it is an auxiliary table */ +bool fts_check_aux_table(const char *name, + table_id_t *table_id, + index_id_t *index_id); + +/** Sync the table during commit phase +@param[in] table table to be synced */ +void fts_sync_during_ddl(dict_table_t* table); diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h new file mode 100644 index 00000000..c527ad8e --- /dev/null +++ b/storage/innobase/include/fts0opt.h @@ -0,0 +1,39 @@ +/***************************************************************************** + +Copyright (c) 2001, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0opt.h +Full Text Search optimize thread + +Created 2011-02-15 Jimmy Yang +***********************************************************************/ +#ifndef INNODB_FTS0OPT_H +#define INNODB_FTS0OPT_H + +/** The FTS optimize thread's work queue. */ +extern ib_wqueue_t* fts_optimize_wq; + +/******************************************************************** +Callback function to fetch the rows in an FTS INDEX record. */ +ibool +fts_optimize_index_fetch_node( +/*==========================*/ + /* out: always returns non-NULL */ + void* row, /* in: sel_node_t* */ + void* user_arg); /* in: pointer to ib_vector_t */ +#endif diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h new file mode 100644 index 00000000..8108e811 --- /dev/null +++ b/storage/innobase/include/fts0pars.h @@ -0,0 +1,72 @@ +/* A Bison parser, made by GNU Bison 2.5. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. 
If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + FTS_OPER = 258, + FTS_TEXT = 259, + FTS_TERM = 260, + FTS_NUMB = 261 + }; +#endif + + + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ + +/* Line 2068 of yacc.c */ +#line 61 "fts0pars.y" + + int oper; + fts_ast_string_t* token; + fts_ast_node_t* node; + + + +/* Line 2068 of yacc.c */ +#line 64 "fts0pars.hh" +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + + + + diff --git a/storage/innobase/include/fts0plugin.h b/storage/innobase/include/fts0plugin.h new file mode 100644 index 00000000..18ec2d6d --- /dev/null +++ b/storage/innobase/include/fts0plugin.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0plugin.h +Full text search plugin header file + +Created 2013/06/04 Shaohua Wang +***********************************************************************/ + +#ifndef INNOBASE_FTS0PLUGIN_H +#define INNOBASE_FTS0PLUGIN_H + +#include "univ.i" + +extern struct st_mysql_ftparser fts_default_parser; + +struct fts_ast_state_t; +/** Call the parser's init / deinit callback with arg if the callback is set. Each expands to one statement (the caller adds the ';'), so it is safe in an unbraced if/else. */ +#define PARSER_INIT(parser, arg) do { if ((parser)->init) { (parser)->init(arg); } } while (0) +#define PARSER_DEINIT(parser, arg) do { if ((parser)->deinit) { (parser)->deinit(arg); } } while (0) + +/******************************************************************//** +fts parse query by plugin parser. +@return 0 if parse successfully, or return non-zero. */ +int +fts_parse_by_parser( +/*================*/ + ibool mode, /*!< in: query boolean mode */ + uchar* query, /*!< in: query string */ + ulint len, /*!< in: query string length */ + st_mysql_ftparser* parse, /*!< in: fts plugin parser */ + fts_ast_state_t* state); /*!< in: query parser state */ + +#endif /* INNOBASE_FTS0PLUGIN_H */ diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h new file mode 100644 index 00000000..4261fc25 --- /dev/null +++ b/storage/innobase/include/fts0priv.h @@ -0,0 +1,502 @@ +/***************************************************************************** + +Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0priv.h +Full text search internal header file + +Created 2011/09/02 Sunny Bains +***********************************************************************/ + +#ifndef INNOBASE_FTS0PRIV_H +#define INNOBASE_FTS0PRIV_H + +#include "dict0dict.h" +#include "pars0pars.h" +#include "que0que.h" +#include "que0types.h" +#include "fts0types.h" + +/* The various states of the FTS sub system pertaining to a table with +FTS indexes defined on it. */ +enum fts_table_state_enum { + /* FTS_TABLE_STATE_RUNNING must be 0 since we insert + a hard coded '0' at create time + to the config table */ + + FTS_TABLE_STATE_RUNNING = 0, /*!< Auxiliary tables created OK */ + + FTS_TABLE_STATE_OPTIMIZING, /*!< This is a substate of RUNNING */ + + FTS_TABLE_STATE_DELETED /*!< All aux tables to be dropped when + it's safe to do so */ +}; + +typedef enum fts_table_state_enum fts_table_state_t; + +/** The default time to wait for the background thread (in microseconds). 
*/ +#define FTS_MAX_BACKGROUND_THREAD_WAIT 10000 + +/** Maximum number of iterations to wait before we complain */ +#define FTS_BACKGROUND_THREAD_WAIT_COUNT 1000 + +/** The maximum length of a config table parameter name in bytes (presumably the key column; confirm against users) */ +#define FTS_MAX_CONFIG_NAME_LEN 64 + +/** The maximum length of the config table's value column in bytes */ +#define FTS_MAX_CONFIG_VALUE_LEN 1024 + +/** Approx. upper limit of ilist length in bytes. */ +#define FTS_ILIST_MAX_SIZE (64 * 1024) + +/** FTS config table name parameters */ + +/** The number of seconds after which an OPTIMIZE run will stop */ +#define FTS_OPTIMIZE_LIMIT_IN_SECS "optimize_checkpoint_limit" + +/** The next doc id */ +#define FTS_SYNCED_DOC_ID "synced_doc_id" + +/** The last word that was OPTIMIZED */ +#define FTS_LAST_OPTIMIZED_WORD "last_optimized_word" + +/** Total number of documents that have been deleted. The next_doc_id +minus this count gives us the total number of documents. */ +#define FTS_TOTAL_DELETED_COUNT "deleted_doc_count" + +/** Total number of words parsed from all documents */ +#define FTS_TOTAL_WORD_COUNT "total_word_count" + +/** Start of optimize of an FTS index */ +#define FTS_OPTIMIZE_START_TIME "optimize_start_time" + +/** End of optimize for an FTS index */ +#define FTS_OPTIMIZE_END_TIME "optimize_end_time" + +/** User specified stopword table name */ +#define FTS_STOPWORD_TABLE_NAME "stopword_table_name" + +/** Whether to use (turn on/off) stopword */ +#define FTS_USE_STOPWORD "use_stopword" + +/** State of the FTS system for this table. It can be one of + RUNNING, OPTIMIZING, DELETED. */ +#define FTS_TABLE_STATE "table_state" + +/** The minimum length of an FTS auxiliary table name's id component +e.g., For an auxiliary table name + + FTS_<TABLE_ID>_SUFFIX + +This constant is for the minimum length required to store the <TABLE_ID> +component. +*/ +#define FTS_AUX_MIN_TABLE_ID_LENGTH 48 + +/** Maximum length of an integer stored in the config table value column. 
*/ +#define FTS_MAX_INT_LEN 32 + +/******************************************************************//** +Parse an SQL string. %s is replaced with the table's id. +@return query graph */ +que_t* +fts_parse_sql( +/*==========*/ + fts_table_t* fts_table, /*!< in: FTS aux table */ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql) /*!< in: SQL string to evaluate */ + MY_ATTRIBUTE((nonnull(3), malloc, warn_unused_result)); +/******************************************************************//** +Evaluate a parsed SQL statement +@return DB_SUCCESS or error code */ +dberr_t +fts_eval_sql( +/*=========*/ + trx_t* trx, /*!< in: transaction */ + que_t* graph) /*!< in: Parsed statement */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Construct the name of an internal FTS table for the given table. +@param[in] fts_table metadata on fulltext-indexed table +@param[out] table_name a name up to MAX_FULL_NAME_LEN +@param[in] dict_locked whether dict_sys.mutex is being held */ +void fts_get_table_name(const fts_table_t* fts_table, char* table_name, + bool dict_locked = false) + MY_ATTRIBUTE((nonnull)); +/******************************************************************//** +Construct the column specification part of the SQL string for selecting the +indexed FTS columns for the given table. Adds the necessary bound +ids to the given 'info' and returns the SQL string. 
Examples: + +One indexed column named "text": + + "$sel0", + info/ids: sel0 -> "text" + +Two indexed columns named "subject" and "content": + + "$sel0, $sel1", + info/ids: sel0 -> "subject", sel1 -> "content", +@return heap-allocated column specification string */ +const char* +fts_get_select_columns_str( +/*=======================*/ + dict_index_t* index, /*!< in: FTS index */ + pars_info_t* info, /*!< in/out: parser info */ + mem_heap_t* heap) /*!< in: memory heap */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** define for fts_doc_fetch_by_doc_id() "option" value, defines whether +we want to get Doc whose ID is equal to or greater or smaller than supplied +ID */ +#define FTS_FETCH_DOC_BY_ID_EQUAL 1 +#define FTS_FETCH_DOC_BY_ID_LARGE 2 +#define FTS_FETCH_DOC_BY_ID_SMALL 3 + +/*************************************************************//** +Fetch document (= a single row's indexed text) with the given +document id. +@return: DB_SUCCESS if fetch is successful, else error */ +dberr_t +fts_doc_fetch_by_doc_id( +/*====================*/ + fts_get_doc_t* get_doc, /*!< in: state */ + doc_id_t doc_id, /*!< in: id of document to fetch */ + dict_index_t* index_to_use, /*!< in: caller supplied FTS index, + or NULL */ + ulint option, /*!< in: search option, if it is + greater than doc_id or equal */ + fts_sql_callback + callback, /*!< in: callback to read + records */ + void* arg) /*!< in: callback arg */ + MY_ATTRIBUTE((nonnull(6))); + +/*******************************************************************//** +Callback function for fetch that stores the text of an FTS document, +converting each column to UTF-16. +@return always FALSE */ +ibool +fts_query_expansion_fetch_doc( +/*==========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ + MY_ATTRIBUTE((nonnull)); +/******************************************************************** +Write out a single word's data as new entry/entries in the INDEX table. +@return DB_SUCCESS if all OK. 
*/ +dberr_t +fts_write_node( +/*===========*/ + trx_t* trx, /*!< in: transaction */ + que_t** graph, /*!< in: query graph */ + fts_table_t* fts_table, /*!< in: the FTS aux index */ + fts_string_t* word, /*!< in: word in UTF-8 */ + fts_node_t* node) /*!< in: node columns */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Check if a fts token is a stopword or less than fts_min_token_size +or greater than fts_max_token_size. +@param[in] token token string +@param[in] stopwords stopwords rb tree +@param[in] cs token charset +@retval true if it is not stopword and length in range +@retval false if it is stopword or length not in range */ +bool +fts_check_token( + const fts_string_t* token, + const ib_rbt_t* stopwords, + const CHARSET_INFO* cs); + +/******************************************************************//** +Initialize a document. */ +void +fts_doc_init( +/*=========*/ + fts_doc_t* doc) /*!< in: doc to initialize */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************************//** +Do a binary search for a doc id in the array +@return +ve index if found -ve index where it should be + inserted if not found */ +int +fts_bsearch( +/*========*/ + doc_id_t* array, /*!< in: array to sort */ + int lower, /*!< in: lower bound of array*/ + int upper, /*!< in: upper bound of array*/ + doc_id_t doc_id) /*!< in: doc id to lookup */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Free document. 
*/ +void +fts_doc_free( +/*=========*/ + fts_doc_t* doc) /*!< in: document */ + MY_ATTRIBUTE((nonnull)); +/******************************************************************//** +Free fts_optimizer_word_t instance.*/ +void +fts_word_free( +/*==========*/ + fts_word_t* word) /*!< in: instance to free.*/ + MY_ATTRIBUTE((nonnull)); +/******************************************************************//** +Read the rows from the FTS index +@return DB_SUCCESS or error code */ +dberr_t +fts_index_fetch_nodes( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + que_t** graph, /*!< in: prepared statement */ + fts_table_t* fts_table, /*!< in: FTS aux table */ + const fts_string_t* + word, /*!< in: the word to fetch */ + fts_fetch_t* fetch) /*!< in: fetch callback.*/ + MY_ATTRIBUTE((nonnull)); +/******************************************************************//** +Compare two fts_trx_table_t instances, we actually compare the +table id's here. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_cmp( +/*==============*/ + const void* v1, /*!< in: id1 */ + const void* v2) /*!< in: id2 */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Compare a table id with a trx_table_t table id. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_id_cmp( +/*=================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#define fts_sql_commit(trx) trx_commit_for_mysql(trx) +#define fts_sql_rollback(trx) (trx)->rollback() +/******************************************************************//** +Parse an SQL string. %s is replaced with the table's id. 
Don't acquire +the dict mutex +@return query graph */ +que_t* +fts_parse_sql_no_dict_lock( +/*=======================*/ + pars_info_t* info, /*!< in: parser info */ + const char* sql) /*!< in: SQL string to evaluate */ + MY_ATTRIBUTE((nonnull(2), malloc, warn_unused_result)); +/******************************************************************//** +Get value from config table. The caller must ensure that enough +space is allocated for value to hold the column contents +@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_value( +/*=================*/ + trx_t* trx, /* transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ + MY_ATTRIBUTE((nonnull)); +/******************************************************************//** +Get value specific to an FTS index from the config table. The caller +must ensure that enough space is allocated for value to hold the +column contents. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Set the value in the config table for name. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_set_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + const fts_string_t* + value) /*!< in: value to update */ + MY_ATTRIBUTE((nonnull)); +/****************************************************************//** +Set an ulint value in the config table. 
+@return DB_SUCCESS if all OK else error code */ +dberr_t +fts_config_set_ulint( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: param name */ + ulint int_value) /*!< in: value */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Set the value specific to an FTS index in the config table. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_set_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#ifdef FTS_OPTIMIZE_DEBUG +/******************************************************************//** +Get an ulint value from the config table. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_index_ulint( +/*=======================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* name, /*!< in: param name */ + ulint* int_value) /*!< out: value */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* FTS_OPTIMIZE_DEBUG */ + +/******************************************************************//** +Set an ulint value in the config table. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_set_index_ulint( +/*=======================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* name, /*!< in: param name */ + ulint int_value) /*!< in: value */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Get an ulint value from the config table. 
+@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_ulint( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: param name */ + ulint* int_value) /*!< out: value */ + MY_ATTRIBUTE((nonnull)); +/******************************************************************//** +Search cache for word. +@return the word node vector if found else NULL */ +const ib_vector_t* +fts_cache_find_word( +/*================*/ + const fts_index_cache_t* + index_cache, /*!< in: cache to search */ + const fts_string_t* + text) /*!< in: word to search for */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/******************************************************************//** +Append deleted doc ids to vector and sort the vector. */ +void +fts_cache_append_deleted_doc_ids( +/*=============================*/ + const fts_cache_t* + cache, /*!< in: cache to use */ + ib_vector_t* vector); /*!< in: append to this vector */ +/******************************************************************//** +Search the index specific cache for a particular FTS index. +@return the index specific cache else NULL */ +fts_index_cache_t* +fts_find_index_cache( +/*================*/ + const fts_cache_t* + cache, /*!< in: cache to search */ + const dict_index_t* + index) /*!< in: index to search for */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Write the table id to the given buffer (including final NUL). Buffer must be +at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long. +@return number of bytes written */ +UNIV_INLINE +int +fts_write_object_id( +/*================*/ + ib_id_t id, /*!< in: a table/index id */ + char* str); /*!< in: buffer to write the id to */ +/******************************************************************//** +Read the table id from the string generated by fts_write_object_id(). 
+@return TRUE if parse successful */ +UNIV_INLINE +ibool +fts_read_object_id( +/*===============*/ + ib_id_t* id, /*!< out: a table id */ + const char* str) /*!< in: buffer to read from */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Get the table id. +@return number of bytes written */ +int +fts_get_table_id( +/*=============*/ + const fts_table_t* + fts_table, /*!< in: FTS Auxiliary table */ + char* table_id) /*!< out: table id, must be at least + FTS_AUX_MIN_TABLE_ID_LENGTH bytes + long */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Construct the name of an internal FTS table for the given table. +@param[in] fts_table metadata on fulltext-indexed table +@param[in] dict_locked whether dict_sys.mutex is being held +@return the prefix, must be freed with ut_free() */ +char* fts_get_table_name_prefix(const fts_table_t* fts_table) + MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); +/******************************************************************//** +Add node positions. */ +void +fts_cache_node_add_positions( +/*=========================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_node_t* node, /*!< in: word node */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ + MY_ATTRIBUTE((nonnull(2,4))); + +/******************************************************************//** +Create the config table name for retrieving index specific value. 
+@return index config parameter name */ +char* +fts_config_create_index_param_name( +/*===============================*/ + const char* param, /*!< in: base name of param */ + const dict_index_t* index) /*!< in: index for config */ + MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); + +#include "fts0priv.ic" + +#endif /* INNOBASE_FTS0PRIV_H */ diff --git a/storage/innobase/include/fts0priv.ic b/storage/innobase/include/fts0priv.ic new file mode 100644 index 00000000..da14cfcb --- /dev/null +++ b/storage/innobase/include/fts0priv.ic @@ -0,0 +1,121 @@ +/***************************************************************************** + +Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0priv.ic +Full text search internal header file + +Created 2011/11/12 Sunny Bains +***********************************************************************/ + +/******************************************************************//** +Write the table id to the given buffer (including final NUL). Buffer must be +at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long. 
+@return number of bytes written */ +UNIV_INLINE +int +fts_write_object_id( +/*================*/ + ib_id_t id, /* in: a table/index id */ + char* str) /* in: buffer to write the id to */ +{ + +#ifdef _WIN32 + + DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name", + return(sprintf(str, UINT64PFx, id));); + + /* Use this to construct old(5.6.14 and 5.7.3) windows + ambiguous aux table names */ + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + return(sprintf(str, "%016llu", (ulonglong) id));); + +#else /* _WIN32 */ + + /* Use this to construct old(5.6.14 and 5.7.3) windows + ambiguous aux table names */ + DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name", + return(sprintf(str, "%016llu", (ulonglong) id));); + + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + return(sprintf(str, "%016llx", (ulonglong) id));); + +#endif /* _WIN32 */ + + return(sprintf(str, "%016llx", (ulonglong) id)); +} + +/******************************************************************//** +Read the table id from the string generated by fts_write_object_id(). +@return TRUE if parse successful */ +UNIV_INLINE +ibool +fts_read_object_id( +/*===============*/ + ib_id_t* id, /* out: an id */ + const char* str) /* in: buffer to read from */ +{ + /* NOTE: this func doesn't care about whether current table + is set with HEX_NAME, the user of the id read here will check + if the id is HEX or DEC and do the right thing with it. */ + return(sscanf(str, UINT64PFx, id) == 1); +} + +/******************************************************************//** +Compare two fts_trx_table_t instances. 
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_cmp( +/*==============*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const dict_table_t* table1 + = (*static_cast<const fts_trx_table_t* const*>(p1))->table; + + const dict_table_t* table2 + = (*static_cast<const fts_trx_table_t* const*>(p2))->table; + + return((table1->id > table2->id) + ? 1 + : (table1->id == table2->id) + ? 0 + : -1); +} + +/******************************************************************//** +Compare a table id with a fts_trx_table_t table id. +@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_id_cmp( +/*=================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const uintmax_t* table_id = static_cast<const uintmax_t*>(p1); + const dict_table_t* table2 + = (*static_cast<const fts_trx_table_t* const*>(p2))->table; + + return((*table_id > table2->id) + ? 1 + : (*table_id == table2->id) + ? 
0 + : -1); +} diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h new file mode 100644 index 00000000..89655ca1 --- /dev/null +++ b/storage/innobase/include/fts0tlex.h @@ -0,0 +1,702 @@ +#ifndef fts0tHEADER_H +#define fts0tHEADER_H 1 +#define fts0tIN_HEADER 1 + +#line 6 "../include/fts0tlex.h" + +#line 8 "../include/fts0tlex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0t_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0t_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0t_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0t_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0t_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0t_scan_buffer +#endif + +#ifdef yy_scan_string +#define fts0t_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0t_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0t_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes fts0t_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0t_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0t_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0t_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0t_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0t_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0t_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0t_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0t_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0tpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0tpush_buffer_state +#endif + +#ifdef yypop_buffer_state 
+#define fts0tpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0tpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0tensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0tensure_buffer_stack +#endif + +#ifdef yylex +#define fts0tlex_ALREADY_DEFINED +#else +#define yylex fts0tlex +#endif + +#ifdef yyrestart +#define fts0trestart_ALREADY_DEFINED +#else +#define yyrestart fts0trestart +#endif + +#ifdef yylex_init +#define fts0tlex_init_ALREADY_DEFINED +#else +#define yylex_init fts0tlex_init +#endif + +#ifdef yylex_init_extra +#define fts0tlex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0tlex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0tlex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy fts0tlex_destroy +#endif + +#ifdef yyget_debug +#define fts0tget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0tget_debug +#endif + +#ifdef yyset_debug +#define fts0tset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0tset_debug +#endif + +#ifdef yyget_extra +#define fts0tget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0tget_extra +#endif + +#ifdef yyset_extra +#define fts0tset_extra_ALREADY_DEFINED +#else +#define yyset_extra fts0tset_extra +#endif + +#ifdef yyget_in +#define fts0tget_in_ALREADY_DEFINED +#else +#define yyget_in fts0tget_in +#endif + +#ifdef yyset_in +#define fts0tset_in_ALREADY_DEFINED +#else +#define yyset_in fts0tset_in +#endif + +#ifdef yyget_out +#define fts0tget_out_ALREADY_DEFINED +#else +#define yyget_out fts0tget_out +#endif + +#ifdef yyset_out +#define fts0tset_out_ALREADY_DEFINED +#else +#define yyset_out fts0tset_out +#endif + +#ifdef yyget_leng +#define fts0tget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0tget_leng +#endif + +#ifdef yyget_text +#define fts0tget_text_ALREADY_DEFINED +#else +#define yyget_text fts0tget_text +#endif + +#ifdef yyget_lineno +#define fts0tget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno 
fts0tget_lineno +#endif + +#ifdef yyset_lineno +#define fts0tset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0tset_lineno +#endif + +#ifdef yyget_column +#define fts0tget_column_ALREADY_DEFINED +#else +#define yyget_column fts0tget_column +#endif + +#ifdef yyset_column +#define fts0tset_column_ALREADY_DEFINED +#else +#define yyset_column fts0tset_column +#endif + +#ifdef yywrap +#define fts0twrap_ALREADY_DEFINED +#else +#define yywrap fts0twrap +#endif + +#ifdef yyalloc +#define fts0talloc_ALREADY_DEFINED +#else +#define yyalloc fts0talloc +#endif + +#ifdef yyrealloc +#define fts0trealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0trealloc +#endif + +#ifdef yyfree +#define fts0tfree_ALREADY_DEFINED +#else +#define yyfree fts0tfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. 
*/ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. 
+ */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. 
+ */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0twrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. 
*/ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. 
+ */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#ifndef fts0t_create_buffer_ALREADY_DEFINED +#undef yy_create_buffer +#endif +#ifndef fts0t_delete_buffer_ALREADY_DEFINED +#undef yy_delete_buffer +#endif +#ifndef fts0t_scan_buffer_ALREADY_DEFINED +#undef yy_scan_buffer +#endif +#ifndef fts0t_scan_string_ALREADY_DEFINED +#undef yy_scan_string +#endif +#ifndef fts0t_scan_bytes_ALREADY_DEFINED +#undef yy_scan_bytes +#endif +#ifndef fts0t_init_buffer_ALREADY_DEFINED +#undef yy_init_buffer +#endif +#ifndef fts0t_flush_buffer_ALREADY_DEFINED +#undef yy_flush_buffer +#endif +#ifndef fts0t_load_buffer_state_ALREADY_DEFINED +#undef yy_load_buffer_state +#endif +#ifndef fts0t_switch_to_buffer_ALREADY_DEFINED +#undef yy_switch_to_buffer +#endif +#ifndef fts0tpush_buffer_state_ALREADY_DEFINED +#undef yypush_buffer_state +#endif +#ifndef fts0tpop_buffer_state_ALREADY_DEFINED +#undef yypop_buffer_state +#endif +#ifndef fts0tensure_buffer_stack_ALREADY_DEFINED +#undef yyensure_buffer_stack +#endif +#ifndef fts0tlex_ALREADY_DEFINED +#undef yylex +#endif +#ifndef fts0trestart_ALREADY_DEFINED +#undef yyrestart +#endif +#ifndef fts0tlex_init_ALREADY_DEFINED +#undef yylex_init +#endif +#ifndef fts0tlex_init_extra_ALREADY_DEFINED +#undef yylex_init_extra +#endif +#ifndef fts0tlex_destroy_ALREADY_DEFINED +#undef yylex_destroy +#endif +#ifndef fts0tget_debug_ALREADY_DEFINED +#undef yyget_debug +#endif +#ifndef fts0tset_debug_ALREADY_DEFINED +#undef yyset_debug +#endif +#ifndef fts0tget_extra_ALREADY_DEFINED +#undef yyget_extra +#endif +#ifndef fts0tset_extra_ALREADY_DEFINED 
+#undef yyset_extra +#endif +#ifndef fts0tget_in_ALREADY_DEFINED +#undef yyget_in +#endif +#ifndef fts0tset_in_ALREADY_DEFINED +#undef yyset_in +#endif +#ifndef fts0tget_out_ALREADY_DEFINED +#undef yyget_out +#endif +#ifndef fts0tset_out_ALREADY_DEFINED +#undef yyset_out +#endif +#ifndef fts0tget_leng_ALREADY_DEFINED +#undef yyget_leng +#endif +#ifndef fts0tget_text_ALREADY_DEFINED +#undef yyget_text +#endif +#ifndef fts0tget_lineno_ALREADY_DEFINED +#undef yyget_lineno +#endif +#ifndef fts0tset_lineno_ALREADY_DEFINED +#undef yyset_lineno +#endif +#ifndef fts0tget_column_ALREADY_DEFINED +#undef yyget_column +#endif +#ifndef fts0tset_column_ALREADY_DEFINED +#undef yyset_column +#endif +#ifndef fts0twrap_ALREADY_DEFINED +#undef yywrap +#endif +#ifndef fts0tget_lval_ALREADY_DEFINED +#undef yyget_lval +#endif +#ifndef fts0tset_lval_ALREADY_DEFINED +#undef yyset_lval +#endif +#ifndef fts0tget_lloc_ALREADY_DEFINED +#undef yyget_lloc +#endif +#ifndef fts0tset_lloc_ALREADY_DEFINED +#undef yyset_lloc +#endif +#ifndef fts0talloc_ALREADY_DEFINED +#undef yyalloc +#endif +#ifndef fts0trealloc_ALREADY_DEFINED +#undef yyrealloc +#endif +#ifndef fts0tfree_ALREADY_DEFINED +#undef yyfree +#endif +#ifndef fts0ttext_ALREADY_DEFINED +#undef yytext +#endif +#ifndef fts0tleng_ALREADY_DEFINED +#undef yyleng +#endif +#ifndef fts0tin_ALREADY_DEFINED +#undef yyin +#endif +#ifndef fts0tout_ALREADY_DEFINED +#undef yyout +#endif +#ifndef fts0t_flex_debug_ALREADY_DEFINED +#undef yy_flex_debug +#endif +#ifndef fts0tlineno_ALREADY_DEFINED +#undef yylineno +#endif +#ifndef fts0ttables_fload_ALREADY_DEFINED +#undef yytables_fload +#endif +#ifndef fts0ttables_destroy_ALREADY_DEFINED +#undef yytables_destroy +#endif +#ifndef fts0tTABLES_NAME_ALREADY_DEFINED +#undef yyTABLES_NAME +#endif + +#line 69 "fts0tlex.l" + + +#line 701 "../include/fts0tlex.h" +#undef fts0tIN_HEADER +#endif /* fts0tHEADER_H */ diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h new file 
mode 100644 index 00000000..1cddaf5b --- /dev/null +++ b/storage/innobase/include/fts0tokenize.h @@ -0,0 +1,189 @@ +/***************************************************************************** + +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fts/fts0tokenize.cc +Full Text Search plugin tokenizer refer to MyISAM + +Created 2014/11/17 Shaohua Wang +***********************************************************************/ + +#include "ft_global.h" +#include "mysql/plugin_ftparser.h" +#include "m_ctype.h" + +/* Macros and structs below are from ftdefs.h in MyISAM */ +/** Check a char is true word */ +#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_') + +/** Check if a char is misc word */ +#define misc_word_char(X) 0 + +/** Boolean search syntax */ +static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX; + +#define FTB_YES (fts_boolean_syntax[0]) +#define FTB_EGAL (fts_boolean_syntax[1]) +#define FTB_NO (fts_boolean_syntax[2]) +#define FTB_INC (fts_boolean_syntax[3]) +#define FTB_DEC (fts_boolean_syntax[4]) +#define FTB_LBR (fts_boolean_syntax[5]) +#define FTB_RBR 
(fts_boolean_syntax[6]) +#define FTB_NEG (fts_boolean_syntax[7]) +#define FTB_TRUNC (fts_boolean_syntax[8]) +#define FTB_LQUOT (fts_boolean_syntax[10]) +#define FTB_RQUOT (fts_boolean_syntax[11]) + +/** FTS query token */ +typedef struct st_ft_word { + uchar* pos; /*!< word start pointer */ + uint len; /*!< word len */ + double weight; /*!< word weight, unused in innodb */ +} FT_WORD; + +/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM. +Differences: a. code format changed; b. stopword processing removed. +@param[in] cs charset +@param[in,out] start doc start pointer +@param[in,out] end doc end pointer +@param[in,out] word token +@param[in,out] info token info +@retval 0 eof +@retval 1 word found +@retval 2 left bracket +@retval 3 right bracket +@retval 4 stopword found */ +inline +uchar +fts_get_word( + const CHARSET_INFO* cs, + uchar** start, + uchar* end, + FT_WORD* word, + MYSQL_FTPARSER_BOOLEAN_INFO* + info) +{ + uchar* doc = *start; + int ctype; + uint mwc; + uint length; + int mbl; + + info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0); + info->weight_adjust = info->wasign = 0; + info->type = FT_TOKEN_EOF; + + while (doc < end) { + for (; doc < end; + doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) { + mbl = cs->ctype(&ctype, doc, end); + + if (true_word_char(ctype, *doc)) { + break; + } + + if (*doc == FTB_RQUOT && info->quot) { + *start = doc + 1; + info->type = FT_TOKEN_RIGHT_PAREN; + + return(info->type); + } + + if (!info->quot) { + if (*doc == FTB_LBR + || *doc == FTB_RBR + || *doc == FTB_LQUOT) { + /* param->prev=' '; */ + *start = doc + 1; + if (*doc == FTB_LQUOT) { + info->quot = (char*)1; + } + + info->type = (*doc == FTB_RBR ? 
+ FT_TOKEN_RIGHT_PAREN : + FT_TOKEN_LEFT_PAREN); + + return(info->type); + } + + if (info->prev == ' ') { + if (*doc == FTB_YES) { + info->yesno = +1; + continue; + } else if (*doc == FTB_EGAL) { + info->yesno = 0; + continue; + } else if (*doc == FTB_NO) { + info->yesno = -1; + continue; + } else if (*doc == FTB_INC) { + info->weight_adjust++; + continue; + } else if (*doc == FTB_DEC) { + info->weight_adjust--; + continue; + } else if (*doc == FTB_NEG) { + info->wasign = !info->wasign; + continue; + } + } + } + + info->prev = char(*doc); + info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0); + info->weight_adjust = info->wasign = 0; + } + + mwc = length = 0; + for (word->pos = doc; + doc < end; + length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) { + mbl = cs->ctype(&ctype, doc, end); + + if (true_word_char(ctype, *doc)) { + mwc = 0; + } else if (!misc_word_char(*doc) || mwc) { + break; + } else { + mwc++; + } + } + + /* Be sure *prev is true_word_char. */ + info->prev = 'A'; + word->len = (uint)(doc-word->pos) - mwc; + + if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) { + doc++; + } + + /* We don't check stopword here. */ + *start = doc; + info->type = FT_TOKEN_WORD; + + return(info->type); + } + + if (info->quot) { + *start = doc; + info->type = FT_TOKEN_RIGHT_PAREN; + } + + return(info->type); +} diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h new file mode 100644 index 00000000..f5760a16 --- /dev/null +++ b/storage/innobase/include/fts0types.h @@ -0,0 +1,386 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0types.h +Full text search types file + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0TYPES_H +#define INNOBASE_FTS0TYPES_H + +#include "fts0fts.h" +#include "fut0fut.h" +#include "pars0pars.h" +#include "que0types.h" +#include "ut0byte.h" +#include "ut0rbt.h" + +/** Types used within FTS. */ +struct fts_que_t; +struct fts_node_t; + +/** Callbacks used within FTS. */ +typedef pars_user_func_cb_t fts_sql_callback; +typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len); + +/** Statistics relevant to a particular document, used during retrieval. */ +struct fts_doc_stats_t { + doc_id_t doc_id; /*!< Document id */ + ulint word_count; /*!< Total words in the document */ +}; + +/** Its main purpose is to store the SQL prepared statements that +are required to retrieve a document from the database. */ +struct fts_get_doc_t { + fts_index_cache_t* + index_cache; /*!< The index cache instance */ + + /*!< Parsed sql statement */ + que_t* get_document_graph; + fts_cache_t* cache; /*!< The parent cache */ +}; + +/** Since we can have multiple FTS indexes on a table, we keep a +per index cache of words etc. 
*/ +struct fts_index_cache_t { + dict_index_t* index; /*!< The FTS index instance */ + + ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*, + cells are fts_tokenizer_word_t*.*/ + + ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t + contained in the memory buffer. + Must be in sorted order (ascending). + The ideal choice is an rb tree but + the rb tree imposes a space overhead + that we can do without */ + + que_t** ins_graph; /*!< Insert query graphs */ + + que_t** sel_graph; /*!< Select query graphs */ + CHARSET_INFO* charset; /*!< charset */ +}; + +/** Stop word control information. */ +struct fts_stopword_t { + ulint status; /*!< Status of the stopword tree */ + ib_alloc_t* heap; /*!< The memory allocator to use */ + ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */ + CHARSET_INFO* charset; /*!< charset for stopword */ +}; + +/** The SYNC state of the cache. There is one instance of this struct +associated with each ADD thread. */ +struct fts_sync_t { + trx_t* trx; /*!< The transaction used for SYNCing + the cache to disk */ + dict_table_t* table; /*!< Table with FTS index(es) */ + ulint max_cache_size; /*!< Max size in bytes of the cache */ + ibool cache_full; /*!< flag, when true it indicates that + we need to sync the cache to disk */ + ulint lower_index; /*!< the start index of the doc id + vector from where to start adding + documents to the FTS cache */ + ulint upper_index; /*!< max index of the doc id vector to + add to the FTS cache */ + ibool interrupted; /*!< TRUE if SYNC was interrupted */ + doc_id_t min_doc_id; /*!< The smallest doc id added to the + cache. 
It should be equal to + doc_ids[lower_index] */ + doc_id_t max_doc_id; /*!< The doc id at which the cache was + noted as being full, we use this to + set the upper_limit field */ + time_t start_time; /*!< SYNC start time; only used if + fts_enable_diag_print */ + bool in_progress; /*!< flag whether sync is in progress.*/ + bool unlock_cache; /*!< flag whether unlock cache when + write fts node */ + os_event_t event; /*!< sync finish event; + only os_event_set() and os_event_wait() + are used */ +}; + +/** The cache for the FTS system. It is a memory-based inverted index +that new entries are added to, until it grows over the configured maximum +size, at which time its contents are written to the INDEX table. */ +struct fts_cache_t { + rw_lock_t lock; /*!< lock protecting all access to the + memory buffer. FIXME: this needs to + be our new upgrade-capable rw-lock */ + + rw_lock_t init_lock; /*!< lock used for the cache + initialization, it has different + SYNC level as above cache lock */ + + ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ + + ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ + + ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each + element is of type fts_update_t */ + + ib_vector_t* indexes; /*!< We store the stats and inverted + index for the individual FTS indexes + in this vector. Each element is + an instance of fts_index_cache_t */ + + ib_vector_t* get_docs; /*!< information required to read + the document from the table. Each + element is of type fts_doc_t */ + + size_t total_size; /*!< total size consumed by the ilist + field of all nodes. SYNC is run + whenever this gets too big */ + fts_sync_t* sync; /*!< sync structure to sync data to + disk */ + ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes + and deleted_doc_ids, ie. 
transient + objects, they are recreated after + a SYNC is completed */ + + ib_alloc_t* self_heap; /*!< This heap is the heap out of + which an instance of the cache itself + was created. Objects created using + this heap will last for the lifetime + of the cache */ + + doc_id_t next_doc_id; /*!< Next doc id */ + + doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ + + doc_id_t first_doc_id; /*!< first doc id since this table + was opened */ + + ulint deleted; /*!< Number of doc ids deleted since + last optimized. This variable is + covered by deleted_lock */ + + ulint added; /*!< Number of doc ids added since last + optimized. This variable is covered by + the deleted lock */ + + fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ + mem_heap_t* cache_heap; /*!< Cache Heap */ +}; + +/** Columns of the FTS auxiliary INDEX table */ +struct fts_node_t { + doc_id_t first_doc_id; /*!< First document id in ilist. */ + + doc_id_t last_doc_id; /*!< Last document id in ilist. */ + + byte* ilist; /*!< Binary list of documents & word + positions the token appears in. + TODO: For now, these are simply + ut_malloc'd, but if testing shows + that they waste memory unacceptably, a + special memory allocator will have + to be written */ + + ulint doc_count; /*!< Number of doc ids in ilist */ + + ulint ilist_size; /*!< Used size of ilist in bytes. */ + + ulint ilist_size_alloc; + /*!< Allocated size of ilist in + bytes */ + bool synced; /*!< flag whether the node is synced */ +}; + +/** A tokenizer word. Contains information about one word. */ +struct fts_tokenizer_word_t { + fts_string_t text; /*!< Token text. 
*/ + + ib_vector_t* nodes; /*!< Word node ilists, each element is + of type fts_node_t */ +}; + +/** Word text plus its array of nodes as on disk in FTS index */ +struct fts_word_t { + fts_string_t text; /*!< Word value in UTF-8 */ + ib_vector_t* nodes; /*!< Nodes read from disk */ + + ib_alloc_t* heap_alloc; /*!< For handling all allocations */ +}; + +/** Callback for reading and filtering nodes that are read from FTS index */ +struct fts_fetch_t { + void* read_arg; /*!< Arg for the sql_callback */ + + fts_sql_callback + read_record; /*!< Callback for reading index + record */ + size_t total_memory; /*!< Total memory used */ +}; + +/** For horizontally splitting an FTS auxiliary index */ +struct fts_index_selector_t { + ulint value; /*!< Character value at which + to split */ + + const char* suffix; /*!< FTS aux index suffix */ +}; + +/** This type represents a single document. */ +struct fts_doc_t { + fts_string_t text; /*!< document text */ + + ibool found; /*!< TRUE if the document was found + successfully in the database */ + + ib_rbt_t* tokens; /*!< This is filled when the document + is tokenized. Tokens; indexed by + fts_string_t*, cells are of type + fts_token_t* */ + + ib_alloc_t* self_heap; /*!< An instance of this type is + allocated from this heap along + with any objects that have the + same lifespan, most notably + the vector of token positions */ + CHARSET_INFO* charset; /*!< Document's charset info */ + + st_mysql_ftparser* parser; /*!< fts plugin parser */ + + ib_rbt_t* stopwords; /*!< Stopwords */ +}; + +/** A token and its positions within a document. */ +struct fts_token_t { + fts_string_t text; /*!< token text */ + + ib_vector_t* positions; /*!< an array of the positions the + token is found in; each item is + actually an ulint. 
*/ +}; + +/** It's defined in fts/fts0fts.c */ +extern const fts_index_selector_t fts_index_selector[]; + +/******************************************************************//** +Compare two fts_trx_row_t instances doc_ids. */ +UNIV_INLINE +int +fts_trx_row_doc_id_cmp( +/*===================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Compare two fts_ranking_t instances doc_ids. */ +UNIV_INLINE +int +fts_ranking_doc_id_cmp( +/*===================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Compare two doc_ids. */ +UNIV_INLINE +int fts_doc_id_cmp( +/*==================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Decode and return the integer that was encoded using our VLC scheme.*/ +UNIV_INLINE +ulint +fts_decode_vlc( +/*===========*/ + /*!< out: value decoded */ + byte** ptr); /*!< in/out: ptr to decode from, this ptr is + incremented by the number of bytes decoded */ + +/******************************************************************//** +Duplicate a string. */ +UNIV_INLINE +void +fts_string_dup( +/*===========*/ + /*!< no return value; + dst receives a + NUL-terminated copy + of src */ + fts_string_t* dst, /*!< out: dup to here */ + const fts_string_t* src, /*!< in: src string */ + mem_heap_t* heap); /*!< in: heap to use */ + +/******************************************************************//** +Return length of val if it were encoded using our VLC scheme. 
*/ +UNIV_INLINE +ulint +fts_get_encoded_len( +/*================*/ + /*!< out: length of value + encoded, in bytes */ + ulint val); /*!< in: value to encode */ + +/******************************************************************//** +Encode an integer using our VLC scheme and return the length in bytes. */ +UNIV_INLINE +ulint +fts_encode_int( +/*===========*/ + /*!< out: length of value + encoded, in bytes */ + ulint val, /*!< in: value to encode */ + byte* buf); /*!< in: buffer, must have + enough space */ + +/******************************************************************//** +Get the selected FTS aux INDEX suffix. */ +UNIV_INLINE +const char* +fts_get_suffix( +/*===========*/ + ulint selected); /*!< in: selected index */ + +/** Select the FTS auxiliary index for the given character. +@param[in] cs charset +@param[in] str string +@param[in] len string length in bytes +@return the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( + const CHARSET_INFO* cs, + const byte* str, + ulint len); + +#include "fts0types.ic" +#include "fts0vlc.ic" + +#endif /* INNOBASE_FTS0TYPES_H */ diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic new file mode 100644 index 00000000..facc1e5c --- /dev/null +++ b/storage/innobase/include/fts0types.ic @@ -0,0 +1,231 @@ +/***************************************************************************** + +Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0types.ic +Full text search types. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0TYPES_IC +#define INNOBASE_FTS0TYPES_IC + +/******************************************************************//** +Duplicate a string. +The copy is allocated from heap and is NUL-terminated; nothing is returned. */ +UNIV_INLINE +void +fts_string_dup( +/*===========*/ + fts_string_t* dst, /*!< out: dup to here */ + const fts_string_t* src, /*!< in: src string */ + mem_heap_t* heap) /*!< in: heap to use */ +{ + dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1); + memcpy(dst->f_str, src->f_str, src->f_len); + + dst->f_len = src->f_len; + dst->f_str[src->f_len] = 0; + dst->f_n_char = src->f_n_char; +} + +/******************************************************************//** +Compare two fts_trx_row_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_row_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1; + const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2; + + return(tr1->doc_id < tr2->doc_id ? -1 : tr1->doc_id > tr2->doc_id); /* compare, do not narrow the difference to int */ +} + +/******************************************************************//** +Compare two fts_ranking_t doc_ids. 
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_ranking_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_ranking_t* rk1 = (const fts_ranking_t*) p1; + const fts_ranking_t* rk2 = (const fts_ranking_t*) p2; + + return(rk1->doc_id < rk2->doc_id ? -1 : rk1->doc_id > rk2->doc_id); /* compare, do not narrow the difference to int */ +} + +/******************************************************************//** +Compare two doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int fts_doc_id_cmp( +/*==================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const doc_id_t* up1 = static_cast<const doc_id_t*>(p1); + const doc_id_t* up2 = static_cast<const doc_id_t*>(p2); + + return *up1 < *up2 ? -1 : *up1 > *up2; /* compare, do not narrow the difference to int */ +} + +/******************************************************************//** +Get the first character's code position for FTS index partition */ +extern +ulint +innobase_strnxfrm( +/*==============*/ + const CHARSET_INFO* cs, /*!< in: Character set */ + const uchar* p2, /*!< in: string */ + const ulint len2); /*!< in: string length */ + +/** Check if fts index charset is cjk +@param[in] cs charset +@retval true if the charset is cjk +@retval false if not. */ +inline bool fts_is_charset_cjk(const CHARSET_INFO* cs) +{ + switch (cs->number) { + case 24: /* my_charset_gb2312_chinese_ci */ + case 28: /* my_charset_gbk_chinese_ci */ + case 1: /* my_charset_big5_chinese_ci */ + case 12: /* my_charset_ujis_japanese_ci */ + case 13: /* my_charset_sjis_japanese_ci */ + case 95: /* my_charset_cp932_japanese_ci */ + case 97: /* my_charset_eucjpms_japanese_ci */ + case 19: /* my_charset_euckr_korean_ci */ + return true; + default: + return false; + } +} + +/** Select the FTS auxiliary index for the given character by range. 
+@param[in] cs charset +@param[in] str string +@param[in] len string length +@retval the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index_by_range( + const CHARSET_INFO* cs, + const byte* str, + ulint len) +{ + ulint selected = 0; + ulint value = innobase_strnxfrm(cs, str, len); + + while (fts_index_selector[selected].value != 0) { + + if (fts_index_selector[selected].value == value) { + + return(selected); + + } else if (fts_index_selector[selected].value > value) { + + return(selected > 0 ? selected - 1 : 0); + } + + ++selected; + } + + ut_ad(selected > 1); + + return(selected - 1); +} + +/** Select the FTS auxiliary index for the given character by hash. +@param[in] cs charset +@param[in] str string +@param[in] len string length +@retval the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index_by_hash( + const CHARSET_INFO* cs, + const byte* str, + ulint len) +{ + ulong nr1 = 1; + ulong nr2 = 4; + + ut_ad(!(str == NULL && len > 0)); + + if (str == NULL || len == 0) { + return 0; + } + + /* Get the first char */ + /* JAN: TODO: MySQL 5.7 had + char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str), + reinterpret_cast<const char*>(str + len)); + */ + size_t char_len = size_t(cs->charlen(str, str + len)); + + ut_ad(char_len <= len); + + /* Get collation hash code */ + my_ci_hash_sort(cs, str, char_len, &nr1, &nr2); + + return(nr1 % FTS_NUM_AUX_INDEX); +} + +/** Select the FTS auxiliary index for the given character. 
+@param[in] cs charset +@param[in] str string +@param[in] len string length in bytes +@retval the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( + const CHARSET_INFO* cs, + const byte* str, + ulint len) +{ + ulint selected; + + if (fts_is_charset_cjk(cs)) { + selected = fts_select_index_by_hash(cs, str, len); + } else { + selected = fts_select_index_by_range(cs, str, len); + } + + return(selected); +} + +/******************************************************************//** +Return the selected FTS aux index suffix. */ +UNIV_INLINE +const char* +fts_get_suffix( +/*===========*/ + ulint selected) /*!< in: selected index */ +{ + return(fts_index_selector[selected].suffix); +} + +#endif /* INNOBASE_FTS0TYPES_IC */ diff --git a/storage/innobase/include/fts0vlc.ic b/storage/innobase/include/fts0vlc.ic new file mode 100644 index 00000000..75d85350 --- /dev/null +++ b/storage/innobase/include/fts0vlc.ic @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0vlc.ic +Full text variable length integer encoding/decoding. 
+ +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0VLC_IC +#define INNOBASE_FTS0VLC_IC + +#include "fts0types.h" + +/******************************************************************//** +Return length of val if it were encoded using our VLC scheme. +FIXME: We will need to be able encode 8 bytes value +@return length of value encoded, in bytes */ +UNIV_INLINE +ulint +fts_get_encoded_len( +/*================*/ + ulint val) /* in: value to encode */ +{ + if (val <= 127) { + return(1); + } else if (val <= 16383) { + return(2); + } else if (val <= 2097151) { + return(3); + } else if (val <= 268435455) { + return(4); + } else { + /* Possibly we should care that on 64-bit machines ulint can + contain values that we can't encode in 5 bytes, but + fts_encode_int doesn't handle them either so it doesn't much + matter. */ + + return(5); + } +} + +/******************************************************************//** +Encode an integer using our VLC scheme and return the length in bytes. +@return length of value encoded, in bytes */ +UNIV_INLINE +ulint +fts_encode_int( +/*===========*/ + ulint val, /* in: value to encode */ + byte* buf) /* in: buffer, must have enough space */ +{ + ulint len; + + if (val <= 127) { + *buf = (byte) val; + + len = 1; + } else if (val <= 16383) { + *buf++ = (byte)(val >> 7); + *buf = (byte)(val & 0x7F); + + len = 2; + } else if (val <= 2097151) { + *buf++ = (byte)(val >> 14); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 3; + } else if (val <= 268435455) { + *buf++ = (byte)(val >> 21); + *buf++ = (byte)((val >> 14) & 0x7F); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 4; + } else { + /* Best to keep the limitations of the 32/64 bit versions + identical, at least for the time being. 
*/ + ut_ad(val <= 4294967295u); + + *buf++ = (byte)(val >> 28); + *buf++ = (byte)((val >> 21) & 0x7F); + *buf++ = (byte)((val >> 14) & 0x7F); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 5; + } + + /* High-bit on means "last byte in the encoded integer". */ + *buf |= 0x80; + + return(len); +} + +/******************************************************************//** +Decode and return the integer that was encoded using our VLC scheme. +@return value decoded */ +UNIV_INLINE +ulint +fts_decode_vlc( +/*===========*/ + byte** ptr) /* in: ptr to decode from, this ptr is + incremented by the number of bytes decoded */ +{ + ulint val = 0; + + for (;;) { + byte b = **ptr; + + ++*ptr; + val |= (b & 0x7F); + + /* High-bit on means "last byte in the encoded integer". */ + if (b & 0x80) { + break; + } else { + val <<= 7; + } + } + + return(val); +} + +#endif diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h new file mode 100644 index 00000000..a52fc256 --- /dev/null +++ b/storage/innobase/include/fut0fut.h @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0fut.h +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + + +#ifndef fut0fut_h +#define fut0fut_h + +#include "mtr0mtr.h" + +/** Gets a pointer to a file address and latches the page. +@param[in] space space id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] addr file address +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_SX_LATCH +@param[in,out] mtr mini-transaction +@param[out] ptr_block file page; may be NULL if the caller does not need it +@return pointer to a byte in the frame of the fetched block; the block is +buffer-fixed and latched */ +UNIV_INLINE +byte* +fut_get_ptr( + ulint space, + ulint zip_size, + fil_addr_t addr, + rw_lock_type_t rw_latch, + mtr_t* mtr, + buf_block_t** ptr_block = NULL) +{ + buf_block_t* block; + byte* ptr = NULL; + + ut_ad(addr.boffset < srv_page_size); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_SX_LATCH)); + + block = buf_page_get(page_id_t(space, addr.page), zip_size, + rw_latch, mtr); + + ptr = buf_block_get_frame(block) + addr.boffset; + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + if (ptr_block != NULL) { + *ptr_block = block; + } + + return(ptr); +} + +#endif /* fut0fut_h */ diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h new file mode 100644 index 00000000..1ade24cd --- /dev/null +++ b/storage/innobase/include/fut0lst.h @@ -0,0 +1,163 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0lst.h +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef fut0lst_h +#define fut0lst_h + +#ifdef UNIV_INNOCHECKSUM +# include "fil0fil.h" +#else +#include "fut0fut.h" +#include "mtr0log.h" + +/* The C 'types' of base node and list node: these should be used to +write self-documenting code. Of course, the sizeof macro cannot be +applied to these types! 
*/ + +typedef byte flst_base_node_t; +typedef byte flst_node_t; + +#endif /* !UNIV_INNOCHECKSUM */ + +/* The physical size of a list base node in bytes */ +#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) +/* The physical size of a list node in bytes */ +#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE) + +#ifndef UNIV_INNOCHECKSUM +/* We define the field offsets of a node for the list */ +#define FLST_PREV 0 /* 6-byte address of the previous list element; + the page part of address is FIL_NULL, if no + previous element */ +#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next + list element; the page part of address + is FIL_NULL, if no next element */ + +/* We define the field offsets of a base node for the list */ +#define FLST_LEN 0 /* 32-bit list length field */ +#define FLST_FIRST 4 /* 6-byte address of the first element + of the list; undefined if empty list */ +#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the + last element of the list; undefined + if empty list */ + +/** Initialize a zero-initialized list base node. +@param[in,out] block file page +@param[in] ofs byte offset of the list base node +@param[in,out] mtr mini-transaction */ +inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr) +{ + ut_ad(!mach_read_from_2(FLST_LEN + ofs + block->frame)); + ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + block->frame)); + ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + block->frame)); + compile_time_assert(FIL_NULL == 0xffU * 0x1010101U); + mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff); + mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff); +} + +/** Initialize a list base node. +@param[in] block file page +@param[in,out] base base node +@param[in,out] mtr mini-transaction */ +void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); + +/** Append a file list node to a list. 
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,outr] mtr mini-transaction */ +void flst_add_last(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/** Prepend a file list node to a list. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,outr] mtr mini-transaction */ +void flst_add_first(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/** Remove a file list node. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur block to be removed +@param[in] coffset byte offset of the current record to be removed +@param[in,outr] mtr mini-transaction */ +void flst_remove(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); + +/** @return the length of a list */ +inline uint32_t flst_get_len(const flst_base_node_t *base) +{ + return mach_read_from_4(base + FLST_LEN); +} + +/** @return a file address */ +inline fil_addr_t flst_read_addr(const byte *faddr) +{ + fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE), + mach_read_from_2(faddr + FIL_ADDR_BYTE) }; + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); + return addr; +} + +/** @return list first node address */ +inline fil_addr_t flst_get_first(const flst_base_node_t *base) +{ + return flst_read_addr(base + FLST_FIRST); +} + +/** @return list last node address */ +inline fil_addr_t flst_get_last(const flst_base_node_t *base) +{ + return flst_read_addr(base + FLST_LAST); +} + +/** @return list next node address */ +inline 
fil_addr_t flst_get_next_addr(const flst_node_t* node) +{ + return flst_read_addr(node + FLST_NEXT); +} + +/** @param[in] node list node +@return address of the previous list node, read from the FLST_PREV field */ +inline fil_addr_t flst_get_prev_addr(const flst_node_t *node) +{ + return flst_read_addr(node + FLST_PREV); +} + +#ifdef UNIV_DEBUG +/** Validate a file-based list. +@param[in] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] mtr mini-transaction */ +void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr); +#endif + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/gis0geo.h b/storage/innobase/include/gis0geo.h new file mode 100644 index 00000000..3fd01a3a --- /dev/null +++ b/storage/innobase/include/gis0geo.h @@ -0,0 +1,122 @@ +/***************************************************************************** +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software Foundation, +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*****************************************************************************/ + +/**************************************************//** +@file gis0geo.h +The r-tree define from MyISAM +*******************************************************/ + +#ifndef _gis0geo_h +#define _gis0geo_h + +#include "my_global.h" +#include "string.h" + +#define SPTYPE HA_KEYTYPE_DOUBLE +#define SPLEN 8 + +/* Since the mbr could be a point or a linestring, in this case, area of +mbr is 0.
So, we define this macro for calculating the area increasing +when we need to enlarge the mbr. */ +#define LINE_MBR_WEIGHTS 0.001 + +/* Types of "well-known binary representation" (wkb) format. */ +enum wkbType +{ + wkbPoint = 1, + wkbLineString = 2, + wkbPolygon = 3, + wkbMultiPoint = 4, + wkbMultiLineString = 5, + wkbMultiPolygon = 6, + wkbGeometryCollection = 7 +}; + +/* Byte order of "well-known binary representation" (wkb) format. */ +enum wkbByteOrder +{ + wkbXDR = 0, /* Big Endian */ + wkbNDR = 1 /* Little Endian */ +}; + +/*************************************************************//** +Calculate minimal bounding rectangle (mbr) of the spatial object +stored in "well-known binary representation" (wkb) format. +@return 0 if ok */ +int +rtree_mbr_from_wkb( +/*===============*/ + const uchar* wkb, /*!< in: pointer to wkb. */ + uint size, /*!< in: size of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr); /*!< in/out: mbr. */ + +/* Rtree split node structure. */ +struct rtr_split_node_t +{ + double square; /* square of the mbr.*/ + int n_node; /* which group in.*/ + uchar* key; /* key. */ + double* coords; /* mbr. */ +}; + +/*************************************************************//** +Reserve n_dim * 2 doubles from the buffer: the current buffer position is +returned and *d_buffer is advanced past the reserved doubles. No bounds +check is done here. +@return the position of *d_buffer before it was advanced */ +inline +static +double* +reserve_coords(double **d_buffer, /*!< in/out: buffer. */ + int n_dim) /*!< in: dimensions. */ +/*===========*/ +{ + double *coords = *d_buffer; + (*d_buffer) += n_dim * 2; + return coords; +} + +/*************************************************************//** +Split rtree nodes. +Return which group the first rec is in.
*/ +int +split_rtree_node( +/*=============*/ + rtr_split_node_t* node, /*!< in: split nodes.*/ + int n_entries, /*!< in: entries number.*/ + int all_size, /*!< in: total key's size.*/ + int key_size, /*!< in: key's size.*/ + int min_size, /*!< in: minimal group size.*/ + int size1, /*!< in: size of group.*/ + int size2, /*!< in: initial group sizes */ + double** d_buffer, /*!< in/out: buffer.*/ + int n_dim, /*!< in: dimensions. */ + uchar* first_rec); /*!< in: the first rec. */ + +/** Compare two minimum bounding rectangles. +@param mode comparison operator + MBR_INTERSECT(a,b) a overlaps b + MBR_CONTAIN(a,b) a contains b + MBR_DISJOINT(a,b) a is disjoint from b + MBR_WITHIN(a,b) a is within b + MBR_EQUAL(a,b) All coordinates of MBRs are equal + MBR_DATA(a,b) Data reference is the same +@param b first MBR +@param a second MBR +@retval 0 if the predicate holds +@retval 1 if the predicate does not hold */ +int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a); +#endif diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h new file mode 100644 index 00000000..f7a2d6cd --- /dev/null +++ b/storage/innobase/include/gis0rtree.h @@ -0,0 +1,494 @@ +/***************************************************************************** + +Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/gis0rtree.h +R-tree header file + +Created 2013/03/27 Jimmy Yang and Allen Lai +***********************************************************************/ + +#ifndef gis0rtree_h +#define gis0rtree_h + +#include "btr0cur.h" +#include "rem0types.h" + +/* Whether MBR 'a' contains 'b' */ +#define MBR_CONTAIN_CMP(a, b) \ + ((((b)->xmin >= (a)->xmin) && ((b)->xmax <= (a)->xmax) \ + && ((b)->ymin >= (a)->ymin) && ((b)->ymax <= (a)->ymax))) + +/* Whether MBR 'a' equals to 'b' */ +#define MBR_EQUAL_CMP(a, b) \ + ((((b)->xmin == (a)->xmin) && ((b)->xmax == (a)->xmax)) \ + && (((b)->ymin == (a)->ymin) && ((b)->ymax == (a)->ymax))) + +/* Whether MBR 'a' intersects 'b': the x ranges and the y ranges must both +overlap. Both bound checks of an axis have to hold; joining them with || +would make the test true for any two rectangles that have min <= max. */ +#define MBR_INTERSECT_CMP(a, b) \ + ((((b)->xmin <= (a)->xmax) && ((b)->xmax >= (a)->xmin)) \ + && (((b)->ymin <= (a)->ymax) && ((b)->ymax >= (a)->ymin))) + +/* Whether MBR 'a' and 'b' disjoint */ +#define MBR_DISJOINT_CMP(a, b) (!MBR_INTERSECT_CMP(a, b)) + +/* Whether MBR 'a' within 'b' */ +#define MBR_WITHIN_CMP(a, b) \ + ((((b)->xmin <= (a)->xmin) && ((b)->xmax >= (a)->xmax)) \ + && (((b)->ymin <= (a)->ymin) && ((b)->ymax >= (a)->ymax))) + +/* Define it for rtree search mode checking: whether mode lies in the range +PAGE_CUR_CONTAIN .. PAGE_CUR_RTREE_GET_FATHER. Note that mode is evaluated +twice. */ +#define RTREE_SEARCH_MODE(mode) \ + (((mode) >= PAGE_CUR_CONTAIN) && ((mode) <= PAGE_CUR_RTREE_GET_FATHER)) + +/* Geometry data header */ +#define GEO_DATA_HEADER_SIZE 4 +/**********************************************************************//** +Builds a Rtree node pointer out of a physical record and a page number.
+@return own: node pointer */ +dtuple_t* +rtr_index_build_node_ptr( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + const rtr_mbr_t* mbr, /*!< in: mbr of lower page */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap); /*!< in: memory heap where pointer + created */ + +/*************************************************************//** +Splits an R-tree index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record */ +rec_t* +rtr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Sets the child node mbr in a node pointer. */ +UNIV_INLINE +void +rtr_page_cal_mbr( +/*=============*/ + const dict_index_t* index, /*!< in: index */ + const buf_block_t* block, /*!< in: buffer block */ + rtr_mbr_t* mbr, /*!< out: MBR encapsulates the page */ + mem_heap_t* heap); /*!< in: heap for the memory + allocation */ +/*************************************************************//** +Find the next matching record. 
This function will first exhaust +the copied record listed in the rtr_info->matches vector before +moving to next page +@return true if there is next qualified record found, otherwise(if +exhausted) false */ +bool +rtr_pcur_move_to_next( +/*==================*/ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + page_cur_mode_t mode, /*!< in: cursor search mode */ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + ulint cur_level, + /*!< in: current level */ + mtr_t* mtr); /*!< in: mtr */ + +/****************************************************************//** +Searches the right position in rtree for a page cursor. */ +bool +rtr_cur_search_with_match( +/*======================*/ + const buf_block_t* block, /*!< in: buffer block */ + dict_index_t* index, /*!< in: index descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor, /*!< in/out: page cursor */ + rtr_info_t* rtr_info);/*!< in/out: search stack */ + +/****************************************************************//** +Calculate the area increased for a new record +@return area increased */ +double +rtr_rec_cal_increase( +/*=================*/ + const dtuple_t* dtuple, /*!< in: data tuple to insert, which + cause area increase */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + double* area); /*!< out: increased area */ + +/****************************************************************//** +Following the right link to find the proper block for insert. 
+@return the proper block.*/ +dberr_t +rtr_ins_enlarge_mbr( +/*=================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +push a nonleaf index node to the search path */ +UNIV_INLINE +void +rtr_non_leaf_stack_push( +/*====================*/ + rtr_node_path_t* path, /*!< in/out: search path */ + uint32_t pageno, /*!< in: pageno to insert */ + node_seq_t seq_no, /*!< in: Node sequence num */ + ulint level, /*!< in: index level */ + uint32_t child_no, /*!< in: child page no */ + btr_pcur_t* cursor, /*!< in: position cursor */ + double mbr_inc); /*!< in: MBR needs to be + enlarged */ + +/**************************************************************//** +push a nonleaf index node to the search path for insertion */ +void +rtr_non_leaf_insert_stack_push( +/*===========================*/ + dict_index_t* index, /*!< in: index descriptor */ + rtr_node_path_t* path, /*!< in/out: search path */ + ulint level, /*!< in: index level */ + const buf_block_t* block, /*!< in: block of the page */ + const rec_t* rec, /*!< in: positioned record */ + double mbr_inc); /*!< in: MBR needs to be + enlarged */ + +#define rtr_get_new_ssn_id(index) (index)->assign_ssn() +#define rtr_get_current_ssn_id(index) (index)->ssn() + +/********************************************************************//** +Create a RTree search info structure */ +rtr_info_t* +rtr_create_rtr_info( +/******************/ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + bool init_matches, /*!< in: Whether to initiate the + "matches" structure for collecting + matched leaf records */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index); /*!< in: index struct */ + +/********************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_info_update_btr( +/******************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + 
rtr_info_t* rtr_info); /*!< in: rtr_info to set to the + cursor */ + +/********************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_init_rtr_info( +/****************/ + rtr_info_t* rtr_info, /*!< in: rtr_info to set to the + cursor */ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index, /*!< in: index structure */ + bool reinit); /*!< in: Whether this is a reinit */ + +/**************************************************************//** +Clean up Rtree cursor */ +void +rtr_clean_rtr_info( +/*===============*/ + rtr_info_t* rtr_info, /*!< in: RTree search info */ + bool free_all); /*!< in: need to free rtr_info itself */ + +/****************************************************************//** +Get the bounding box content from an index record*/ +void +rtr_get_mbr_from_rec( +/*=================*/ + const rec_t* rec, /*!< in: index record */ + const rec_offs* offsets,/*!< in: offsets array */ + rtr_mbr_t* mbr); /*!< out: MBR */ + +/****************************************************************//** +Get the bounding box content from a MBR data record */ +void +rtr_get_mbr_from_tuple( +/*===================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + rtr_mbr* mbr); /*!< out: mbr to fill */ + +/** Get the rtree page father. +@param[in] index rtree index +@param[in] block child page in the index +@param[in] mtr mtr +@param[in] sea_cur search cursor, contains information + about parent nodes in search +@param[in] cursor cursor on node pointer record, + its page x-latched */ +void +rtr_page_get_father( + dict_index_t* index, + buf_block_t* block, + mtr_t* mtr, + btr_cur_t* sea_cur, + btr_cur_t* cursor); + +/************************************************************//** +Returns the father block to a page. It is assumed that mtr holds +an X or SX latch on the tree.
+@return rec_get_offsets() of the node pointer record */ +rec_offs* +rtr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + dict_index_t* index, /*!< in: b-tree index */ + buf_block_t* block, /*!< in: child page in the index */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* sea_cur,/*!< in: search cursor, contains information + about parent nodes in search */ + btr_cur_t* cursor);/*!< out: cursor on node pointer record, + its page x-latched */ +/**************************************************************//** +Store the parent path cursor +@return number of cursor stored */ +ulint +rtr_store_parent_path( +/*==================*/ + const buf_block_t* block, /*!< in: block of the page */ + btr_cur_t* btr_cur,/*!< in/out: persistent cursor */ + ulint latch_mode, + /*!< in: latch_mode */ + ulint level, /*!< in: index level */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +void +rtr_pcur_open_low( +/*==============*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level in the btree */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... 
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ + +#define rtr_pcur_open(i,t,md,l,c,m) \ + rtr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m) + +struct btr_cur_t; + +/*********************************************************//** +Returns the R-Tree node stored in the parent search path +@return pointer to R-Tree cursor component */ +UNIV_INLINE +node_visit_t* +rtr_get_parent_node( +/*================*/ + btr_cur_t* btr_cur, /*!< in: persistent cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert); /*!< in: whether it is insert */ + +/*********************************************************//** +Returns the R-Tree cursor stored in the parent search path +@return pointer to R-Tree cursor component */ +UNIV_INLINE +btr_pcur_t* +rtr_get_parent_cursor( +/*==================*/ + btr_cur_t* btr_cur, /*!< in: persistent cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert); /*!< in: whether insert operation */ + +/*************************************************************//** +Copy recs from a page to new_block of rtree. */ +void +rtr_page_copy_rec_list_end_no_locks( +/*================================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in/out: heap memory */ + rtr_rec_move_t* rec_move, /*!< in: recording records moved */ + ulint max_move, /*!< in: num of rec to move */ + ulint* num_moved, /*!< out: num of rec to move */ + mtr_t* mtr); /*!< in: mtr */ + +/*************************************************************//** +Copy recs till a specified rec from a page to new_block of rtree. 
*/ +void +rtr_page_copy_rec_list_start_no_locks( +/*==================================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in/out: heap memory */ + rtr_rec_move_t* rec_move, /*!< in: recording records moved */ + ulint max_move, /*!< in: num of rec to move */ + ulint* num_moved, /*!< out: num of rec to move */ + mtr_t* mtr); /*!< in: mtr */ + +/****************************************************************//** +Merge 2 mbrs and update the mbr that cursor is on. */ +dberr_t +rtr_merge_and_update_mbr( +/*=====================*/ + btr_cur_t* cursor, /*!< in/out: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + page_t* child_page, /*!< in: the child page. */ + mtr_t* mtr); /*!< in: mtr */ + +/*************************************************************//** +Deletes on the upper level the node pointer to a page. */ +void +rtr_node_ptr_delete( +/*================*/ + btr_cur_t* cursor, /*!< in: search cursor, contains information + about parent nodes in search */ + mtr_t* mtr); /*!< in: mtr */ + +/****************************************************************//** +Check whether two MBRs are identical or need to be merged */ +bool +rtr_merge_mbr_changed( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + rtr_mbr_t* new_mbr); /*!< out: MBR to update */ + + +/**************************************************************//** +Update the mbr field of a spatial index row.
+@return true if successful */ +bool +rtr_update_mbr_field( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor pointed to rec.*/ + rec_offs* offsets, /*!< in: offsets on rec. */ + btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec + that should be deleted. + this cursor is for btr_compress to + delete the merged page's father rec.*/ + page_t* child_page, /*!< in: child page. */ + rtr_mbr_t* new_mbr, /*!< in: the new mbr. */ + rec_t* new_rec, /*!< in: rec to use */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Check whether a Rtree page is child of a parent page +@return true if there is child/parent relationship */ +bool +rtr_check_same_block( +/*=================*/ + dict_index_t* index, /*!< in: index tree */ + btr_cur_t* cur, /*!< in/out: position at the parent entry + pointing to the child if successful */ + buf_block_t* parentb,/*!< in: parent page to check */ + buf_block_t* childb, /*!< in: child Page */ + mem_heap_t* heap); /*!< in: memory heap */ + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +rtr_write_mbr( +/*==========*/ + byte* data, /*!< out: data */ + const rtr_mbr_t* mbr); /*!< in: data */ + +/*********************************************************************//** +Sets pointer to the data and length in a field. 
*/ +UNIV_INLINE +void +rtr_read_mbr( +/*==========*/ + const byte* data, /*!< in: data */ + rtr_mbr_t* mbr); /*!< out: data */ + +/**************************************************************//** +Check whether a discarding page is in anyone's search path */ +void +rtr_check_discard_page( +/*===================*/ + dict_index_t* index, /*!< in: index */ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + buf_block_t* block); /*!< in: block of page to be discarded */ + +/********************************************************************//** +Reinitialize a RTree search info */ +UNIV_INLINE +void +rtr_info_reinit_in_cursor( +/************************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + dict_index_t* index, /*!< in: index struct */ + bool need_prdt); /*!< in: Whether predicate lock is + needed */ + +/** Estimates the number of rows in a given area. +@param[in] index index +@param[in] tuple range tuple containing mbr, may also be empty tuple +@param[in] mode search mode +@return estimated number of rows */ +ha_rows +rtr_estimate_n_rows_in_range( + dict_index_t* index, + const dtuple_t* tuple, + page_cur_mode_t mode); + +#include "gis0rtree.ic" +#endif /*!< gis0rtree.h */ diff --git a/storage/innobase/include/gis0rtree.ic b/storage/innobase/include/gis0rtree.ic new file mode 100644 index 00000000..1b53caa3 --- /dev/null +++ b/storage/innobase/include/gis0rtree.ic @@ -0,0 +1,242 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/gis0rtree.ic +R-tree Inline code + +Created 2013/03/27 Jimmy Yang and Allen Lai +***********************************************************************/ + +/**************************************************************//** +Calculates the MBR that encloses the MBRs of all records on a page. +The MBR of each record is read from its first field. If no record +follows the page infimum, the output MBR keeps its initial value +(min = DBL_MAX, max = -DBL_MAX). */ +UNIV_INLINE +void +rtr_page_cal_mbr( +/*=============*/ + const dict_index_t* index, /*!< in: index */ + const buf_block_t* block, /*!< in: buffer block */ + rtr_mbr_t* rtr_mbr,/*!< out: MBR encapsulates the page */ + mem_heap_t* heap) /*!< in: heap for the memory + allocation */ +{ + page_t* page; + rec_t* rec; + const byte* field; + ulint len; + rec_offs* offsets = NULL; + double bmin, bmax; + double* amin; + double* amax; + ulint inc = 0; + double* mbr; + + /* Start from an inverted rectangle so that the first record + always replaces every bound. */ + rtr_mbr->xmin = DBL_MAX; + rtr_mbr->ymin = DBL_MAX; + rtr_mbr->xmax = -DBL_MAX; + rtr_mbr->ymax = -DBL_MAX; + + mbr = reinterpret_cast<double*>(rtr_mbr); + + page = buf_block_get_frame(block); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + /* The loop below already treats a NULL next record as the end of + the list; apply the same check before the first record is used. */ + if (rec == NULL) { + return; + } + + /* The offsets are computed once, for the first record, and reused + for every record on the page. */ + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + do { + /* The mbr address is in the first field.
*/ + field = rec_get_nth_field(rec, offsets, 0, &len); + + ut_ad(len == DATA_MBR_LEN); + inc = 0; + for (unsigned i = 0; i < SPDIMS; i++) { + bmin = mach_double_read(field + inc); + bmax = mach_double_read(field + inc + sizeof(double)); + + /* Every dimension takes a (min, max) pair of doubles, + the same stride of 2 that is used for the stored field. */ + amin = mbr + i * 2; + amax = mbr + i * 2 + 1; + + if (*amin > bmin) + *amin = bmin; + if (*amax < bmax) + *amax = bmax; + + inc += 2 * sizeof(double); + } + + rec = page_rec_get_next(rec); + + if (rec == NULL) { + break; + } + } while (!page_rec_is_supremum(rec)); +} + +/**************************************************************//** +push a nonleaf index node to the search path */ +UNIV_INLINE +void +rtr_non_leaf_stack_push( +/*====================*/ + rtr_node_path_t* path, /*!< in/out: search path */ + uint32_t pageno, /*!< in: pageno to insert */ + node_seq_t seq_no, /*!< in: Node sequence num */ + ulint level, /*!< in: index page level */ + uint32_t child_no, /*!< in: child page no */ + btr_pcur_t* cursor, /*!< in: position cursor */ + double mbr_inc) /*!< in: MBR needs to be + enlarged */ +{ + node_visit_t insert_val; + + insert_val.page_no = pageno; + insert_val.seq_no = seq_no; + insert_val.level = level; + insert_val.child_no = child_no; + insert_val.cursor = cursor; + insert_val.mbr_inc = mbr_inc; + + path->push_back(insert_val); + +#ifdef RTR_SEARCH_DIAGNOSTIC + fprintf(stderr, "INNODB_RTR: Push page %d, level %d, seq %d" + " to search stack \n", + static_cast<int>(pageno), static_cast<int>(level), + static_cast<int>(seq_no)); +#endif /* RTR_SEARCH_DIAGNOSTIC */ +} + +/*********************************************************************//** +Writes an MBR to a byte buffer as SPDIMS * 2 consecutive doubles.
*/ +UNIV_INLINE +void +rtr_write_mbr( +/*==========*/ + byte* data, /*!< out: data */ + const rtr_mbr_t* mbr) /*!< in: MBR to write */ +{ + const double* my_mbr = reinterpret_cast<const double*>(mbr); + + for (unsigned i = 0; i < SPDIMS * 2; i++) { + mach_double_write(data + i * sizeof(double), my_mbr[i]); + } +} + +/*********************************************************************//** +Reads the SPDIMS * 2 doubles of an MBR from a byte buffer using mach_double_read(). */ +UNIV_INLINE +void +rtr_read_mbr( +/*==========*/ + const byte* data, /*!< in: data */ + rtr_mbr_t* mbr) /*!< out: MBR */ +{ + for (unsigned i = 0; i < SPDIMS * 2; i++) { + (reinterpret_cast<double*>(mbr))[i] = mach_double_read( + data + + i * sizeof(double)); + } +} + +/*********************************************************//** +Returns the R-Tree node stored in the parent search path +@return pointer to the node_visit_t entry in the parent path, +NULL if level >= tree_height, the parent path is empty, or (non-insert lookup) no entry has the given level */ +UNIV_INLINE +node_visit_t* +rtr_get_parent_node( +/*================*/ + btr_cur_t* btr_cur, /*!< in: tree cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert) /*!< in: whether it is insert */ +{ + ulint num; + ulint tree_height = btr_cur->tree_height; + node_visit_t* found_node = NULL; + + if (level >= tree_height) { + return(NULL); + } + + mutex_enter(&btr_cur->rtr_info->rtr_path_mutex); + + num = btr_cur->rtr_info->parent_path->size(); + + if (!num) { + mutex_exit(&btr_cur->rtr_info->rtr_path_mutex); + return(NULL); + } + + if (is_insert) { + ulint idx = tree_height - level - 1; + ut_ad(idx < num); + + found_node = &(*btr_cur->rtr_info->parent_path)[idx]; + } else { + node_visit_t* node; + + while (num > 0) { + node = &(*btr_cur->rtr_info->parent_path)[num - 1]; + + if (node->level == level) { + found_node = node; + break; + } + num--; + } + } + + mutex_exit(&btr_cur->rtr_info->rtr_path_mutex); + + return(found_node); +} + 
+/*********************************************************//** +Returns the R-Tree cursor stored in the parent search path +@return pointer to R-Tree cursor component */ +UNIV_INLINE +btr_pcur_t* +rtr_get_parent_cursor( +/*==================*/ + btr_cur_t* btr_cur, /*!< in: persistent cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert) /*!< in: whether insert operation */ +{ + node_visit_t* found_node = rtr_get_parent_node( + btr_cur, level, is_insert); + + return((found_node) ? found_node->cursor : NULL); +} + +/********************************************************************//** +Reinitialize a R-Tree search info in btr_cur_t */ +UNIV_INLINE +void +rtr_info_reinit_in_cursor( +/************************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + dict_index_t* index, /*!< in: index struct */ + bool need_prdt) /*!< in: Whether predicate lock is + needed */ +{ + rtr_clean_rtr_info(cursor->rtr_info, false); + rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true); +} diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h new file mode 100644 index 00000000..55944bfc --- /dev/null +++ b/storage/innobase/include/gis0type.h @@ -0,0 +1,152 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include gis0type.h +R-tree header file + +Created 2013/03/27 Jimmy Yang +***********************************************************************/ + +#ifndef gis0type_h +#define gis0type_h + +#include "buf0buf.h" +#include "data0type.h" +#include "data0types.h" +#include "dict0types.h" +#include "ut0vec.h" +#include "gis0geo.h" + +#include <vector> +#include <forward_list> + +/** Node Sequence Number. Only updated when page splits */ +typedef uint32_t node_seq_t; + +/* RTree internal non-leaf Nodes to be searched, from root to leaf */ +struct node_visit_t { + uint32_t page_no; /*!< the page number */ + node_seq_t seq_no; /*!< the SSN (split sequence number */ + ulint level; /*!< the page's index level */ + uint32_t child_no; /*!< child page num if for parent + recording */ + btr_pcur_t* cursor; /*!< cursor structure if we positioned + FIXME: there is no need to use whole + btr_pcur_t, just the position related + members */ + double mbr_inc; /*!< whether this node needs to be + enlarged for insertion */ +}; + +typedef std::vector<node_visit_t, ut_allocator<node_visit_t> > rtr_node_path_t; + +typedef struct rtr_rec { + rec_t* r_rec; /*!< matched record */ + bool locked; /*!< whether the record locked */ +} rtr_rec_t; + +typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> > rtr_rec_vector; + +/* Structure for matched records on the leaf page */ +typedef struct matched_rec { + byte* bufp; /*!< aligned buffer point */ + byte rec_buf[UNIV_PAGE_SIZE_MAX * 2]; + /*!< buffer used to copy matching rec */ + buf_block_t block; /*!< the shadow buffer block */ + ulint used; /*!< memory used */ + rtr_rec_vector* 
matched_recs; /*!< vector holding the matching rec */ + ib_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs + vector */ + bool valid; /*!< whether result in matched_recs + or this search is valid (page not + dropped) */ + bool locked; /*!< whether these recs locked */ +} matched_rec_t; + +/* In memory representation of a minimum bounding rectangle */ +typedef struct rtr_mbr { + double xmin; /*!< minimum on x */ + double xmax; /*!< maximum on x */ + double ymin; /*!< minimum on y */ + double ymax; /*!< maximum on y */ +} rtr_mbr_t; + +/* Maximum index level for R-Tree, this is consistent with BTR_MAX_LEVELS */ +#define RTR_MAX_LEVELS 100 + +/* Number of pages we latch at leaf level when there is possible Tree +modification (split, shrink), we always latch left, current +and right pages */ +#define RTR_LEAF_LATCH_NUM 3 + +/** Vectors holding the matching internal pages/nodes and leaf records */ +typedef struct rtr_info{ + rtr_node_path_t*path; /*!< vector holding matching pages */ + rtr_node_path_t*parent_path; + /*!< vector holding parent pages during + search */ + matched_rec_t* matches;/*!< struct holding matching leaf records */ + ib_mutex_t rtr_path_mutex; + /*!< mutex protect the "path" vector */ + buf_block_t* tree_blocks[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM]; + /*!< tracking pages that would be locked + at leaf level, for future free */ + ulint tree_savepoints[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM]; + /*!< savepoint used to release latches/blocks + on each level and leaf level */ + rtr_mbr_t mbr; /*!< the search MBR */ + que_thr_t* thr; /*!< the search thread */ + mem_heap_t* heap; /*!< memory heap */ + btr_cur_t* cursor; /*!< cursor used for search */ + dict_index_t* index; /*!< index it is searching */ + bool need_prdt_lock; + /*!< whether we will need predicate lock + the tree */ + bool need_page_lock; + /*!< whether we will need predicate page lock + the tree */ + bool allocated;/*!< whether this structure is allocate or + on stack */ + bool 
mbr_adj;/*!< whether mbr will need to be enlarged + for an insertion operation */ + bool fd_del; /*!< found deleted row */ + const dtuple_t* search_tuple; + /*!< search tuple being used */ + page_cur_mode_t search_mode; + /*!< current search mode */ +} rtr_info_t; + +/* Tracking structure for all ongoing searches for an index */ +struct rtr_info_track_t { + /** Active search info */ + std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active; + ib_mutex_t rtr_active_mutex; + /*!< mutex to protect + rtr_active */ +}; + +/* This is to record the record movement between pages. Used for corresponding +lock movement */ +typedef struct rtr_rec_move { + rec_t* old_rec; /*!< record being moved in old page */ + rec_t* new_rec; /*!< new record location */ + bool moved; /*!< whether locks are moved too */ +} rtr_rec_move_t; +#endif /* gis0type_h */ diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h new file mode 100644 index 00000000..561c3225 --- /dev/null +++ b/storage/innobase/include/ha0ha.h @@ -0,0 +1,60 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0ha.h +The hash table interface for the adaptive hash index + +Created 8/18/1994 Heikki Tuuri +*******************************************************/ + +#ifndef ha0ha_h +#define ha0ha_h + +#include "hash0hash.h" +#include "page0types.h" +#include "buf0types.h" +#include "rem0types.h" + +#ifdef BTR_CUR_HASH_ADAPT +/*************************************************************//** +Looks for an element in a hash table. +@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: folded value of the searched data */ + +/** The hash table external chain node */ +struct ha_node_t { + ulint fold; /*!< fold value for the data */ + ha_node_t* next; /*!< next chain node or NULL if none */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block; /*!< buffer block containing the data, or NULL */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data; /*!< pointer to the data */ +}; + +#include "ha0ha.ic" +#endif /* BTR_CUR_HASH_ADAPT */ + +#endif diff --git a/storage/innobase/include/ha0ha.ic b/storage/innobase/include/ha0ha.ic new file mode 100644 index 00000000..0b256257 --- /dev/null +++ b/storage/innobase/include/ha0ha.ic @@ -0,0 +1,154 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ha0ha.ic +The hash table interface for the adaptive hash index + +Created 8/18/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef BTR_CUR_HASH_ADAPT +#include "btr0types.h" + +/******************************************************************//** +Gets a hash node data. +@return pointer to the data */ +UNIV_INLINE +const rec_t* +ha_node_get_data( +/*=============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->data); +} + +/******************************************************************//** +Sets hash node data. */ +UNIV_INLINE +void +ha_node_set_data_func( +/*==================*/ + ha_node_t* node, /*!< in: hash chain node */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data) /*!< in: pointer to the data */ +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data = data; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Sets hash node data. 
+@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** Sets hash node data. +@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/******************************************************************//** +Gets the next node in a hash chain. +@return next node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_next( +/*==============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->next); +} + +/******************************************************************//** +Gets the first node in a hash chain. +@return first node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_first( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value determining the chain */ +{ + return static_cast<ha_node_t*>(table->array[table->calc_hash(fold)].node); +} + +/*************************************************************//** +Looks for an element in a hash table. +@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: folded value of the searched data */ +{ + ut_ad(btr_search_enabled); + + for (const ha_node_t* node = ha_chain_get_first(table, fold); + node != NULL; + node = ha_chain_get_next(node)) { + + if (node->fold == fold) { + + return(node->data); + } + } + + return(NULL); +} + +/*********************************************************//** +Looks for an element when we know the pointer to the data. 
+@return pointer to the hash table node, NULL if not found in the table */ +UNIV_INLINE +ha_node_t* +ha_search_with_data( +/*================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data) /*!< in: pointer to the data */ +{ + ha_node_t* node; + + ut_ad(btr_search_enabled); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->data == data) { + + return(node); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +#endif /* BTR_CUR_HASH_ADAPT */ diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h new file mode 100644 index 00000000..db23ddc6 --- /dev/null +++ b/storage/innobase/include/ha0storage.h @@ -0,0 +1,137 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.h +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. 
+ +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#ifndef ha0storage_h +#define ha0storage_h + +#include "univ.i" + +/** This value is used by default by ha_storage_create(). More memory +is allocated later when/if it is needed. */ +#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024 + +/** This value is used by default by ha_storage_create(). It is a +constant per ha_storage's lifetime. */ +#define HA_STORAGE_DEFAULT_HASH_CELLS 4096 + +/** Hash storage */ +struct ha_storage_t; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. +@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells); /*!< in: initial number of cells + in the hash table */ + +/*******************************************************************//** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". +@return pointer to the copy */ +const void* +ha_storage_put_memlim( +/*==================*/ + ha_storage_t* storage, /*!< in/out: hash storage */ + const void* data, /*!< in: data to store */ + ulint data_len, /*!< in: data length */ + ulint memlim); /*!< in: memory limit to obey */ + +/*******************************************************************//** +Same as ha_storage_put_memlim() but without memory limit. 
+@param storage in/out: hash storage +@param data in: data to store +@param data_len in: data length +@return pointer to the copy of the string */ +#define ha_storage_put(storage, data, data_len) \ + ha_storage_put_memlim((storage), (data), (data_len), 0) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy. If the +same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@return pointer to the copy of the string */ +#define ha_storage_put_str(storage, str) \ + ((const char*) ha_storage_put((storage), (str), strlen(str) + 1)) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy obeying +a memory limit. +If the same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@param memlim in: memory limit to obey +@return pointer to the copy of the string */ +#define ha_storage_put_str_memlim(storage, str, memlim) \ + ((const char*) ha_storage_put_memlim((storage), (str), \ + strlen(str) + 1, (memlim))) + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage); /*!< in/out: hash storage */ + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). 
*/ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage); /*!< in, own: hash storage */ + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage); /*!< in: hash storage */ + +#include "ha0storage.ic" + +#endif /* ha0storage_h */ diff --git a/storage/innobase/include/ha0storage.ic b/storage/innobase/include/ha0storage.ic new file mode 100644 index 00000000..df9679cf --- /dev/null +++ b/storage/innobase/include/ha0storage.ic @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.ic +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. 
+ +Created September 24, 2007 Vasil Dimov +*******************************************************/ + +#include "hash0hash.h" +#include "mem0mem.h" + +/** Hash storage for strings */ +struct ha_storage_t { + mem_heap_t* heap; /*!< memory heap from which memory is + allocated */ + hash_table_t hash; /*!< hash table used to avoid + duplicates */ +}; + +/** Objects of this type are stored in ha_storage_t */ +struct ha_storage_node_t { + ulint data_len;/*!< length of the data */ + const void* data; /*!< pointer to data */ + ha_storage_node_t* next; /*!< next node in hash chain */ +}; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. +@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells) /*!< in: initial number of cells + in the hash table */ +{ + ha_storage_t* storage; + mem_heap_t* heap; + + if (initial_heap_bytes == 0) { + + initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES; + } + + if (initial_hash_cells == 0) { + + initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS; + } + + /* we put "storage" within "storage->heap" */ + + heap = mem_heap_create(sizeof(ha_storage_t) + + initial_heap_bytes); + + storage = (ha_storage_t*) mem_heap_alloc(heap, + sizeof(ha_storage_t)); + + storage->heap = heap; + storage->hash.create(initial_hash_cells); + + return(storage); +} + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. 
*/ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage) /*!< in/out: hash storage */ +{ + ha_storage_t temp_storage; + + temp_storage.heap = (*storage)->heap; + temp_storage.hash = (*storage)->hash; + + temp_storage.hash.clear(); + mem_heap_empty(temp_storage.heap); + + *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap, + sizeof(ha_storage_t)); + + (*storage)->heap = temp_storage.heap; + (*storage)->hash = temp_storage.hash; +} + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage) /*!< in, own: hash storage */ +{ + storage->hash.free(); + mem_heap_free(storage->heap); +} + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage) /*!< in: hash storage */ +{ + ulint ret; + + ret = mem_heap_get_size(storage->heap); + + /* this assumes hash->heap and hash->heaps are NULL */ + ret += sizeof(hash_table_t); + ret += sizeof(hash_cell_t) * storage->hash.n_cells; + + return(ret); +} diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h new file mode 100644 index 00000000..453f9e02 --- /dev/null +++ b/storage/innobase/include/ha_prototypes.h @@ -0,0 +1,522 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ha_prototypes.h +Prototypes for global functions in ha_innodb.cc that are called by +InnoDB C code. + +NOTE: This header is intended to insulate InnoDB from SQL names and functions. +Do not include any headers other than univ.i into this unless they are very +simple headers. +************************************************************************/ + +#ifndef HA_INNODB_PROTOTYPES_H +#define HA_INNODB_PROTOTYPES_H + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +/* Forward declarations */ +class THD; +class Field; + +// JAN: TODO missing features: +#undef MYSQL_FT_INIT_EXT +#undef MYSQL_PFS +#undef MYSQL_STORE_FTS_DOC_ID + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). 
+@return number of bytes that were written */ +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size); /*!< in: output buffer size + in bytes */ + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name); /*!< in: concatenation of + database name, path separator, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + +/** Quote a standard SQL identifier like tablespace, index or column name. +@param[in] file output stream +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote */ +void +innobase_quote_identifier( + FILE* file, + trx_t* trx, + const char* id); + +/** Quote an standard SQL identifier like tablespace, index or column name. +Return the string as an std:string object. +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote +@return a std::string with id properly quoted. */ +std::string +innobase_quote_identifier( + trx_t* trx, + const char* id); + +/*****************************************************************//** +Convert a table name to the MySQL system_charset_info (UTF-8). 
+@return pointer to the end of buf */ +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: table name to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd); /*!< in: MySQL connection thread, or NULL */ + +/******************************************************************//** +Returns true if the thread is the replication thread on the slave +server. +@return true if thd is the replication thread */ +ibool +thd_is_replication_slave_thread( +/*============================*/ + THD* thd); /*!< in: thread handle */ + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd); /*!< in: thread handle */ + +/** +Get high resolution timestamp for the current query start time. + +@retval timestamp in microseconds precision +*/ +unsigned long long thd_query_start_micro(const MYSQL_THD thd); + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: pointer to a MySQL THD object */ + uint max_query_len); /*!< in: max query length to print, or 0 to + use the default max length */ + +/** Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. 
InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type'; +at least ENUM and SET, and unsigned integer types are 'unsigned types' +@param[in] field MySQL Field +@return DATA_BINARY, DATA_VARCHAR, ... */ +uint8_t +get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field); + +/******************************************************************//** +Get the variable length bounds of the given character set. */ +void +innobase_get_cset_width( +/*====================*/ + ulint cset, /*!< in: MySQL charset-collation code */ + unsigned*mbminlen, /*!< out: minimum length of a char (in bytes) */ + unsigned*mbmaxlen); /*!< out: maximum length of a char (in bytes) */ + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +@return 0 if a=b, <0 if a<b, >0 if a>b */ +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b); /*!< in: second string to compare */ + +/** Strip dir name from a full path name and return only the file name +@param[in] path_name full path name +@return file name or "null" if no file name */ +const char* +innobase_basename( + const char* path_name); + +/******************************************************************//** +Converts an identifier to a table name. */ +void +innobase_convert_from_table_id( +/*===========================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; should + be at least 5 * strlen(from) + 1 */ +/******************************************************************//** +Converts an identifier to UTF-8. 
*/ +void +innobase_convert_from_id( +/*=====================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; + should be at least 3 * strlen(from) + 1 */ +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +void +innobase_casedn_str( +/*================*/ + char* a); /*!< in/out: string to put in lower case */ + +#ifdef WITH_WSREP +void wsrep_innobase_kill_one_trx(THD *bf_thd, trx_t *victim_trx, bool signal); +ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, + unsigned char* str, ulint str_length, + ulint buf_length); +#endif /* WITH_WSREP */ + +extern "C" struct charset_info_st *thd_charset(THD *thd); + +/** Determines the current SQL statement. +Thread unsafe, can only be called from the thread owning the THD. +@param[in] thd MySQL thread handle +@param[out] length Length of the SQL statement +@return SQL statement string */ +const char* +innobase_get_stmt_unsafe( + THD* thd, + size_t* length); + +/******************************************************************//** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. +@return number of bytes occupied by the first n characters */ +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + ulint charset_id, /*!< in: character set id */ + ulint prefix_len, /*!< in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /*!< in: length of the string in bytes */ + const char* str); /*!< in: character string */ + +/** Get status of innodb_tmpdir. 
+@param[in] thd thread handle, or NULL to query + the global innodb_tmpdir. +@retval NULL if innodb_tmpdir="" */ +UNIV_INTERN +const char* +thd_innodb_tmpdir( + THD* thd); + +/******************************************************************//** +Returns the lock wait timeout for the current connection. +@return the lock wait timeout, in seconds */ +ulong +thd_lock_wait_timeout( +/*==================*/ + THD* thd); /*!< in: thread handle, or NULL to query + the global innodb_lock_wait_timeout */ +/** Get status of innodb_tmpdir. +@param[in] thd thread handle, or NULL to query + the global innodb_tmpdir. +@retval NULL if innodb_tmpdir="" */ +const char* +thd_innodb_tmpdir( + THD* thd); + +/**********************************************************************//** +Get the current setting of the table_cache_size global parameter. We do +a dirty read because for one there is no synchronization object and +secondly there is little harm in doing so even if we get a torn read. +@return value of table_cache_size */ +ulint +innobase_get_table_cache_size(void); +/*===============================*/ + +/**********************************************************************//** +Get the current setting of the lower_case_table_names global parameter from +mysqld.cc. We do a dirty read because for one there is no synchronization +object and secondly there is little harm in doing so even if we get a torn +read. +@return value of lower_case_table_names */ +ulint +innobase_get_lower_case_table_names(void); +/*=====================================*/ + +/******************************************************************//** +Compare two character strings case insensitively according to their charset. 
*/ +int +innobase_fts_text_case_cmp( +/*=======================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Returns true if transaction should be flagged as read-only. +@return true if the thd is marked as read-only */ +bool +thd_trx_is_read_only( +/*=================*/ + THD* thd); /*!< in/out: thread handle */ + +/******************************************************************//** +Check if the transaction is an auto-commit transaction. TRUE also +implies that it is a SELECT (read-only) transaction. +@return true if the transaction is an auto commit read-only transaction. */ +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd); /*!< in: thread handle, or NULL */ + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table name +to the MySQL system_charset_info (UTF-8) and quote it if needed. +@return pointer to the end of buf */ +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name); /*!< in: table name to format */ + +/** Corresponds to Sql_condition:enum_warning_level. */ +enum ib_log_level_t { + IB_LOG_LEVEL_INFO, + IB_LOG_LEVEL_WARN, + IB_LOG_LEVEL_ERROR, + IB_LOG_LEVEL_FATAL +}; + +/******************************************************************//** +Use this when the args are first converted to a formatted string and then +passed to the format string from errmsg-utf8.txt. The error message format +must be: "Some string ... %s". 
+ +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +void +ib_errf( +/*====*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + const char* format, /*!< printf format */ + ...) /*!< Args */ + MY_ATTRIBUTE((format(printf, 4, 5))); + +/******************************************************************//** +Use this when the args are passed to the format string from +errmsg-utf8.txt directly as is. + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +void +ib_senderrf( +/*========*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + ...); /*!< Args */ + +extern const char* TROUBLESHOOTING_MSG; +extern const char* TROUBLESHOOT_DATADICT_MSG; +extern const char* BUG_REPORT_MSG; +extern const char* FORCE_RECOVERY_MSG; +extern const char* OPERATING_SYSTEM_ERROR_MSG; +extern const char* FOREIGN_KEY_CONSTRAINTS_MSG; +extern const char* SET_TRANSACTION_MSG; +extern const char* INNODB_PARAMETERS_MSG; + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +const char* +server_get_hostname(); +/*=================*/ + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. 
The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. +@return the next value */ +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ + MY_ATTRIBUTE((pure, warn_unused_result)); + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +The input to this function is an identifier in charset my_charset_filename. +return true when length of identifier is too long. */ +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id); /* in: identifier to check. it must belong + to charset my_charset_filename */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. 
*/ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_filename_charset( +/*=================================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len); /* in: length of 'to', in bytes */ + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +UNIV_INTERN +void +ib_push_warning( + trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char *format,/*!< in: warning message */ + ...); + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +UNIV_INTERN +void +ib_push_warning( + void* ithd, /*!< in: thd */ + dberr_t error, /*!< in: error code to push as warning */ + const char *format,/*!< in: warning message */ + ...); + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +UNIV_INTERN +void +ib_foreign_warn( + trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char *table_name, + const char *format,/*!< in: warning message */ + ...); + +/*****************************************************************//** +Normalizes a table name string. A normalized name consists of the +database name catenated to '/' and table name. An example: +test/mytable. 
On Windows normalization puts both the database name and the +table name always to lower case if "set_lower_case" is set to TRUE. */ +void +normalize_table_name_c_low( +/*=======================*/ + char* norm_name, /*!< out: normalized name as a + null-terminated string */ + const char* name, /*!< in: table name string */ + ibool set_lower_case); /*!< in: TRUE if we want to set + name to lower case */ +/** Update the system variable with the given value of the InnoDB +buffer pool size. +@param[in] buf_pool_size given value of buffer pool size.*/ +void +innodb_set_buf_pool_size(ulonglong buf_pool_size); + +/** Create a MYSQL_THD for a background thread and mark it as such. +@param name thread info for SHOW PROCESSLIST +@return new MYSQL_THD */ +MYSQL_THD +innobase_create_background_thd(const char* name); + +/** Destroy a background purge thread THD. +@param[in] thd MYSQL_THD to destroy */ +void +innobase_destroy_background_thd(MYSQL_THD); + +/** Close opened tables, free memory, delete items for a MYSQL_THD. +@param[in] thd MYSQL_THD to reset */ +void +innobase_reset_background_thd(MYSQL_THD); + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* HA_INNODB_PROTOTYPES_H */ diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h new file mode 100644 index 00000000..add983a0 --- /dev/null +++ b/storage/innobase/include/handler0alter.h @@ -0,0 +1,108 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/handler0alter.h +Smart ALTER TABLE +*******************************************************/ + +#include "rem0types.h" + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. */ +void +innobase_rec_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ + MY_ATTRIBUTE((nonnull)); + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. */ +void +innobase_fields_to_mysql( +/*=====================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_index_t* index, /*!< in: InnoDB index */ + const dfield_t* fields) /*!< in: InnoDB index fields */ + MY_ATTRIBUTE((nonnull)); + +/*************************************************************//** +Copies an InnoDB row to table->record[0]. */ +void +innobase_row_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_table_t* itab, /*!< in: InnoDB table */ + const dtuple_t* row) /*!< in: InnoDB row */ + MY_ATTRIBUTE((nonnull)); + +/** Generate the next autoinc based on a snapshot of the session +auto_increment_increment and auto_increment_offset variables. 
*/ +struct ib_sequence_t { + + /** + @param thd the session + @param start_value the lower bound + @param max_value the upper bound (inclusive) */ + ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value); + + /** Postfix increment + @return the value to insert */ + ulonglong operator++(int) UNIV_NOTHROW; + + /** Check if the autoinc "sequence" is exhausted. + @return true if the sequence is exhausted */ + bool eof() const UNIV_NOTHROW + { + return(m_eof); + } + + /** + @return the next value in the sequence */ + ulonglong last() const UNIV_NOTHROW + { + ut_ad(m_next_value > 0); + + return(m_next_value); + } + + /** @return maximum column value + @retval 0 if not adding AUTO_INCREMENT column */ + ulonglong max_value() const { return m_max_value; } + +private: + /** Maximum value if adding an AUTO_INCREMENT column, else 0 */ + ulonglong m_max_value; + + /** Value of auto_increment_increment */ + ulong m_increment; + + /** Value of auto_increment_offset */ + ulong m_offset; + + /** Next value in the sequence */ + ulonglong m_next_value; + + /** true if no more values left in the sequence */ + bool m_eof; +}; diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h new file mode 100644 index 00000000..981ff5a0 --- /dev/null +++ b/storage/innobase/include/hash0hash.h @@ -0,0 +1,236 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/hash0hash.h +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#pragma once +#include "ut0rnd.h" + +struct hash_table_t; +struct hash_cell_t{ + void* node; /*!< hash chain node, NULL if none */ +}; +typedef void* hash_node_t; + +/*******************************************************************//** +Inserts a struct to a hash table. */ + +#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + (DATA)->NAME = NULL;\ +\ + cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ +\ + if (cell3333->node == NULL) {\ + cell3333->node = DATA;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != NULL) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + }\ +\ + struct3333->NAME = DATA;\ + }\ +} while (0) + +/*******************************************************************//** +Inserts a struct to the head of hash table. 
*/ + +#define HASH_PREPEND(TYPE, NAME, TABLE, FOLD, DATA) \ +do { \ + hash_cell_t* cell3333; \ + TYPE* struct3333; \ + \ + (DATA)->NAME = NULL; \ + \ + cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ + \ + if (cell3333->node == NULL) { \ + cell3333->node = DATA; \ + DATA->NAME = NULL; \ + } else { \ + struct3333 = (TYPE*) cell3333->node; \ + \ + DATA->NAME = struct3333; \ + \ + cell3333->node = DATA; \ + } \ +} while (0) +#ifdef UNIV_HASH_DEBUG +# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) +# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1 +#else +# define HASH_ASSERT_VALID(DATA) do {} while (0) +# define HASH_INVALIDATE(DATA, NAME) do {} while (0) +#endif + +/*******************************************************************//** +Deletes a struct from a hash table. */ + +#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ +\ + if (cell3333->node == DATA) {\ + HASH_ASSERT_VALID(DATA->NAME);\ + cell3333->node = DATA->NAME;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != DATA) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + ut_a(struct3333);\ + }\ +\ + struct3333->NAME = DATA->NAME;\ + }\ + HASH_INVALIDATE(DATA, NAME);\ +} while (0) + +#define HASH_REPLACE(TYPE, NAME, TABLE, FOLD, DATA_OLD, DATA_NEW) \ + do { \ + (DATA_NEW)->NAME = (DATA_OLD)->NAME; \ + \ + hash_cell_t& cell3333 \ + = (TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ + TYPE** struct3333 = (TYPE**)&cell3333.node; \ + while (*struct3333 != DATA_OLD) { \ + struct3333 = &((*struct3333)->NAME); \ + } \ + *struct3333 = DATA_NEW; \ + } while (0) +/*******************************************************************//** +Gets the first struct in a hash chain, NULL if none. 
*/ + +#define HASH_GET_FIRST(TABLE, HASH_VAL) (TABLE)->array[HASH_VAL].node + +/*******************************************************************//** +Gets the next struct in a hash chain, NULL if none. */ + +#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME) + +/********************************************************************//** +Looks for a struct in a hash table. */ +#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\ +{\ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \ + HASH_ASSERT_VALID(DATA);\ +\ + while ((DATA) != NULL) {\ + ASSERTION;\ + if (TEST) {\ + break;\ + } else {\ + HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\ + }\ + }\ +} + +/********************************************************************//** +Looks for an item in all hash buckets. */ +#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \ +do { \ + ulint i3333; \ + \ + for (i3333 = (TABLE)->n_cells; i3333--; ) { \ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \ + \ + while ((DATA) != NULL) { \ + HASH_ASSERT_VALID(DATA); \ + ASSERTION; \ + \ + if (TEST) { \ + break; \ + } \ + \ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \ + } \ + \ + if ((DATA) != NULL) { \ + break; \ + } \ + } \ +} while (0) + +/****************************************************************//** +Move all hash table entries from OLD_TABLE to NEW_TABLE. 
*/ + +#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \ +do {\ + ulint i2222;\ + ulint cell_count2222;\ +\ + cell_count2222 = (OLD_TABLE)->n_cells; \ +\ + for (i2222 = 0; i2222 < cell_count2222; i2222++) {\ + NODE_TYPE* node2222 = static_cast<NODE_TYPE*>(\ + HASH_GET_FIRST((OLD_TABLE), i2222));\ +\ + while (node2222) {\ + NODE_TYPE* next2222 = static_cast<NODE_TYPE*>(\ + node2222->PTR_NAME);\ + ulint fold2222 = FOLD_FUNC(node2222);\ +\ + HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\ + fold2222, node2222);\ +\ + node2222 = next2222;\ + }\ + }\ +} while (0) + +/** Hash table with singly-linked overflow lists */ +struct hash_table_t +{ + /** number of elements in array (a prime number) */ + ulint n_cells; + /** the hash array */ + hash_cell_t *array; + + /** Create the hash table. + @param n the lower bound of n_cells */ + void create(ulint n) + { + n_cells= ut_find_prime(n); + array= static_cast<hash_cell_t*>(ut_zalloc_nokey(n_cells * sizeof *array)); + } + + /** Clear the hash table. */ + void clear() { memset(array, 0, n_cells * sizeof *array); } + + /** Free the hash table. */ + void free() { ut_free(array); array= nullptr; } + + ulint calc_hash(ulint fold) const { return ut_hash_ulint(fold, n_cells); } +}; diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h new file mode 100644 index 00000000..81ab7566 --- /dev/null +++ b/storage/innobase/include/ib0mutex.h @@ -0,0 +1,773 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ib0mutex.h +Policy based mutexes. + +Created 2013-03-26 Sunny Bains. +***********************************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +#ifndef ib0mutex_h +#define ib0mutex_h + +#include "my_cpu.h" +#include "os0event.h" +#include "sync0arr.h" + +/** OS mutex for tracking lock/unlock for debugging */ +template <template <typename> class Policy> +struct OSTrackMutex { + + typedef Policy<OSTrackMutex> MutexPolicy; + + explicit OSTrackMutex(bool destroy_mutex_at_exit = true) + UNIV_NOTHROW + { + ut_d(m_freed = true); + ut_d(m_locked = false); + ut_d(m_destroy_at_exit = destroy_mutex_at_exit); + } + + ~OSTrackMutex() UNIV_NOTHROW + { + ut_ad(!m_destroy_at_exit || !m_locked); + } + + /** Initialise the mutex. */ + void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW + { + ut_ad(m_freed); + ut_ad(!m_locked); + + m_mutex.init(); + + ut_d(m_freed = false); + } + + /** Destroy the mutex */ + void destroy() UNIV_NOTHROW + { + ut_ad(!m_locked); + ut_ad(!m_freed); + + m_mutex.destroy(); + + ut_d(m_freed = true); + } + + /** Release the mutex. */ + void exit() UNIV_NOTHROW + { + ut_ad(m_locked); + ut_d(m_locked = false); + ut_ad(!m_freed); + + m_mutex.exit(); + } + + /** Acquire the mutex. 
*/ + void enter(uint32_t, uint32_t, const char*, uint32_t) + UNIV_NOTHROW + { + ut_ad(!m_freed); + + m_mutex.enter(); + + ut_ad(!m_locked); + ut_d(m_locked = true); + } + + /** @return true if locking succeeded */ + bool try_lock() UNIV_NOTHROW + { + ut_ad(!m_freed); + + bool locked = m_mutex.try_lock(); + + if (locked) { + ut_ad(!m_locked); + ut_d(m_locked = locked); + } + + return(locked); + } + + /** @return non-const version of the policy */ + MutexPolicy& policy() + UNIV_NOTHROW + { + return(m_policy); + } + + /** @return the const version of the policy */ + const MutexPolicy& policy() const + UNIV_NOTHROW + { + return(m_policy); + } + +private: +#ifdef UNIV_DEBUG + /** true if the mutex has not be initialized */ + bool m_freed; + + /** true if the mutex has been locked. */ + bool m_locked; + + /** Do/Dont destroy mutex at exit */ + bool m_destroy_at_exit; +#endif /* UNIV_DEBUG */ + + /** OS Mutex instance */ + OSMutex m_mutex; + + /** Policy data */ + MutexPolicy m_policy; +}; + + +#ifdef __linux__ + +#include <linux/futex.h> +#include <sys/syscall.h> + +/** Mutex implementation that used the Linux futex. */ +template <template <typename> class Policy> +struct TTASFutexMutex { + + typedef Policy<TTASFutexMutex> MutexPolicy; + + TTASFutexMutex() UNIV_NOTHROW + : + m_lock_word(MUTEX_STATE_UNLOCKED) + { + /* Check that lock_word is aligned. */ + ut_ad(!((ulint) &m_lock_word % sizeof(ulint))); + } + + ~TTASFutexMutex() + { + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); + } + + /** Called when the mutex is "created". Note: Not from the constructor + but when the mutex is initialised. */ + void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW + { + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); + } + + /** Destroy the mutex. */ + void destroy() UNIV_NOTHROW + { + /* The destructor can be called at shutdown. 
*/ + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); + } + + /** Acquire the mutex. + @param[in] max_spins max number of spins + @param[in] max_delay max delay per spin */ + void enter(uint32_t max_spins, uint32_t max_delay, + const char*, uint32_t) UNIV_NOTHROW + { + uint32_t n_spins, n_waits; + + for (n_spins= 0; n_spins < max_spins; n_spins++) { + if (try_lock()) { + m_policy.add(n_spins, 0); + return; + } + + ut_delay(max_delay); + } + + for (n_waits= 0;; n_waits++) { + if (m_lock_word.exchange(MUTEX_STATE_WAITERS, + std::memory_order_acquire) + == MUTEX_STATE_UNLOCKED) { + break; + } + + syscall(SYS_futex, &m_lock_word, + FUTEX_WAIT_PRIVATE, MUTEX_STATE_WAITERS, + 0, 0, 0); + } + + m_policy.add(n_spins, n_waits); + } + + /** Release the mutex. */ + void exit() UNIV_NOTHROW + { + if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED, + std::memory_order_release) + == MUTEX_STATE_WAITERS) { + syscall(SYS_futex, &m_lock_word, FUTEX_WAKE_PRIVATE, + 1, 0, 0, 0); + } + } + + /** Try and lock the mutex. + @return true if successful */ + bool try_lock() UNIV_NOTHROW + { + int32 oldval = MUTEX_STATE_UNLOCKED; + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); + } + + /** @return non-const version of the policy */ + MutexPolicy& policy() UNIV_NOTHROW + { + return(m_policy); + } + + /** @return const version of the policy */ + const MutexPolicy& policy() const UNIV_NOTHROW + { + return(m_policy); + } +private: + /** Policy data */ + MutexPolicy m_policy; + + /** lock_word is the target of the atomic test-and-set instruction + when atomic operations are enabled. */ + std::atomic<int32> m_lock_word; +}; + +#endif /* __linux__ */ + +template <template <typename> class Policy> +struct TTASMutex { + + typedef Policy<TTASMutex> MutexPolicy; + + TTASMutex() UNIV_NOTHROW + : + m_lock_word(MUTEX_STATE_UNLOCKED) + { + /* Check that lock_word is aligned. 
*/ + ut_ad(!((ulint) &m_lock_word % sizeof(ulint))); + } + + ~TTASMutex() + { + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); + } + + /** Called when the mutex is "created". Note: Not from the constructor + but when the mutex is initialised. */ + void init(latch_id_t) UNIV_NOTHROW + { + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); + } + + /** Destroy the mutex. */ + void destroy() UNIV_NOTHROW + { + /* The destructor can be called at shutdown. */ + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); + } + + /** Try and lock the mutex. + @return true on success */ + bool try_lock() UNIV_NOTHROW + { + uint32_t oldval = MUTEX_STATE_UNLOCKED; + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); + } + + /** Release the mutex. */ + void exit() UNIV_NOTHROW + { + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_LOCKED); + m_lock_word.store(MUTEX_STATE_UNLOCKED, + std::memory_order_release); + } + + /** Acquire the mutex. 
+ @param max_spins max number of spins + @param max_delay max delay per spin */ + void enter(uint32_t max_spins, uint32_t max_delay, + const char*, uint32_t) UNIV_NOTHROW + { + const uint32_t step = max_spins; + uint32_t n_spins = 0; + + while (!try_lock()) { + ut_delay(max_delay); + if (++n_spins == max_spins) { + os_thread_yield(); + max_spins+= step; + } + } + + m_policy.add(n_spins, 0); + } + + /** @return non-const version of the policy */ + MutexPolicy& policy() UNIV_NOTHROW + { + return(m_policy); + } + + /** @return const version of the policy */ + const MutexPolicy& policy() const UNIV_NOTHROW + { + return(m_policy); + } + +private: + // Disable copying + TTASMutex(const TTASMutex&); + TTASMutex& operator=(const TTASMutex&); + + /** Policy data */ + MutexPolicy m_policy; + + /** mutex state */ + std::atomic<uint32_t> m_lock_word; +}; + +template <template <typename> class Policy> +struct TTASEventMutex { + + typedef Policy<TTASEventMutex> MutexPolicy; + + TTASEventMutex() + UNIV_NOTHROW + : + m_lock_word(MUTEX_STATE_UNLOCKED), + m_event() + { + /* Check that lock_word is aligned. */ + ut_ad(!((ulint) &m_lock_word % sizeof(ulint))); + } + + ~TTASEventMutex() + UNIV_NOTHROW + { + ut_ad(state() == MUTEX_STATE_UNLOCKED); + } + + /** Called when the mutex is "created". Note: Not from the constructor + but when the mutex is initialised. + @param[in] id Mutex ID */ + void init(latch_id_t id, const char*, uint32_t) UNIV_NOTHROW + { + ut_a(m_event == 0); + ut_ad(state() == MUTEX_STATE_UNLOCKED); + + m_event = os_event_create(sync_latch_get_name(id)); + } + + /** This is the real destructor. This mutex can be created in BSS and + its destructor will be called on exit(). We can't call + os_event_destroy() at that stage. */ + void destroy() + UNIV_NOTHROW + { + ut_ad(state() == MUTEX_STATE_UNLOCKED); + + /* We have to free the event before InnoDB shuts down. */ + os_event_destroy(m_event); + m_event = 0; + } + + /** Try and lock the mutex. 
Note: POSIX returns 0 on success. + @return true on success */ + bool try_lock() + UNIV_NOTHROW + { + uint32_t oldval = MUTEX_STATE_UNLOCKED; + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); + } + + /** Release the mutex. If the previous state was MUTEX_STATE_WAITERS, + the event is set and the sync array is notified so that waiting + threads retry. */ + void exit() + UNIV_NOTHROW + { + if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED, + std::memory_order_release) + == MUTEX_STATE_WAITERS) { + os_event_set(m_event); + sync_array_object_signalled(); + } + } + + /** Acquire the mutex. Retries try_lock(), calling ut_delay(max_delay) + between attempts. Whenever the spin count reaches the current limit the + limit is raised by the initial max_spins, the thread yields, reserves a + sync array cell and tries to switch the state from MUTEX_STATE_LOCKED to + MUTEX_STATE_WAITERS; it then waits on the cell unless the mutex was seen + unlocked, in which case the cell is freed and try_lock() is retried. + The spin and wait counts are passed to m_policy.add() on success. + @param[in] max_spins max number of spins + @param[in] max_delay max delay per spin + @param[in] filename from where called + @param[in] line within filename */ + void enter( + uint32_t max_spins, + uint32_t max_delay, + const char* filename, + uint32_t line) + UNIV_NOTHROW + { + uint32_t n_spins = 0; + uint32_t n_waits = 0; + const uint32_t step = max_spins; + + while (!try_lock()) { + if (n_spins++ == max_spins) { + max_spins += step; + n_waits++; + os_thread_yield(); + + sync_cell_t* cell; + sync_array_t *sync_arr = sync_array_get_and_reserve_cell( + this, SYNC_MUTEX, + filename, line, &cell); + + uint32_t oldval = MUTEX_STATE_LOCKED; /* expected value for the compare-exchange below; receives the observed state if it fails */ + m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_WAITERS, + std::memory_order_relaxed, + std::memory_order_relaxed); + + if (oldval == MUTEX_STATE_UNLOCKED) { /* the mutex was released in the meantime: do not wait, retry try_lock() */ + sync_array_free_cell(sync_arr, cell); + } else { + sync_array_wait_event(sync_arr, cell); + } + } else { + ut_delay(max_delay); + } + } + + m_policy.add(n_spins, n_waits); + } + + /** @return the lock state. 
*/ + int32 state() const + UNIV_NOTHROW + { + return m_lock_word.load(std::memory_order_relaxed); + } + + /** The event that the mutex will wait on in sync0arr.cc + @return event instance */ + os_event_t event() + UNIV_NOTHROW + { + return(m_event); + } + + /** @return non-const version of the policy */ + MutexPolicy& policy() + UNIV_NOTHROW + { + return(m_policy); + } + + /** @return const version of the policy */ + const MutexPolicy& policy() const + UNIV_NOTHROW + { + return(m_policy); + } + +private: + /** Disable copying */ + TTASEventMutex(const TTASEventMutex&); + TTASEventMutex& operator=(const TTASEventMutex&); + + /** mutex state */ + std::atomic<uint32_t> m_lock_word; + + /** Used by sync0arr.cc for the wait queue */ + os_event_t m_event; + + /** Policy data */ + MutexPolicy m_policy; +}; + +/** Mutex interface for all policy mutexes. This class handles the interfacing +with the Performance Schema instrumentation. */ +template <typename MutexImpl> +struct PolicyMutex +{ + typedef typename MutexImpl::MutexPolicy Policy; + + PolicyMutex() UNIV_NOTHROW : m_impl() + { +#ifdef UNIV_PFS_MUTEX + m_ptr = 0; +#endif /* UNIV_PFS_MUTEX */ + } + + ~PolicyMutex() { } + + /** @return non-const version of the policy */ + Policy& policy() UNIV_NOTHROW + { + return(m_impl.policy()); + } + + /** @return const version of the policy */ + const Policy& policy() const UNIV_NOTHROW + { + return(m_impl.policy()); + } + + /** Release the mutex. */ + void exit() UNIV_NOTHROW + { +#ifdef UNIV_PFS_MUTEX + pfs_exit(); +#endif /* UNIV_PFS_MUTEX */ + + ut_d(policy().context.release(m_impl)); + + m_impl.exit(); + } + + /** Acquire the mutex. + @param n_spins max number of spins + @param n_delay max delay per spin + @param name filename where locked + @param line line number where locked */ + void enter( + uint32_t n_spins, + uint32_t n_delay, + const char* name, + uint32_t line) UNIV_NOTHROW + { +#ifdef UNIV_PFS_MUTEX + /* Note: locker is really an alias for state. 
That's why + it has to be in the same scope during pfs_end(). */ + + PSI_mutex_locker_state state; + PSI_mutex_locker* locker; + + locker = pfs_begin_lock(&state, name, line); +#endif /* UNIV_PFS_MUTEX */ + + ut_d(policy().context.enter(m_impl, name, line)); + + m_impl.enter(n_spins, n_delay, name, line); + + ut_d(policy().context.locked(m_impl, name, line)); +#ifdef UNIV_PFS_MUTEX + pfs_end(locker, 0); +#endif /* UNIV_PFS_MUTEX */ + } + + /** Try and lock the mutex, return 0 on SUCCESS and 1 otherwise. + The outcome is also reported to the Performance Schema, so that a + failed attempt is not recorded as a successful acquisition. + @param name filename where locked + @param line line number where locked + @return 0 if the mutex was acquired, 1 otherwise */ + int trylock(const char* name, uint32_t line) UNIV_NOTHROW + { +#ifdef UNIV_PFS_MUTEX + /* Note: locker is really an alias for state. That's why + it has to be in the same scope during pfs_end(). */ + + PSI_mutex_locker_state state; + PSI_mutex_locker* locker; + + locker = pfs_begin_trylock(&state, name, line); +#endif /* UNIV_PFS_MUTEX */ + + /* There is a subtlety here, we check the mutex ordering + after locking here. This is only done to avoid add and + then remove if the trylock was unsuccessful. */ + + int ret = m_impl.try_lock() ? 0 : 1; + + if (ret == 0) { + + ut_d(policy().context.enter(m_impl, name, line)); + + ut_d(policy().context.locked(m_impl, name, line)); + } + +#ifdef UNIV_PFS_MUTEX + pfs_end(locker, ret); /* pass the real result: 0 for success, 1 for failure */ +#endif /* UNIV_PFS_MUTEX */ + + return(ret); + } + +#ifdef UNIV_DEBUG + /** @return true if the thread owns the mutex. */ + bool is_owned() const UNIV_NOTHROW + { + return(policy().context.is_owned()); + } +#endif /* UNIV_DEBUG */ + + /** + Initialise the mutex. 
+ + @param[in] id Mutex ID + @param[in] filename file where created + @param[in] line line number in file where created */ + void init( + latch_id_t id, + const char* filename, + uint32_t line) + UNIV_NOTHROW + { +#ifdef UNIV_PFS_MUTEX + pfs_add(sync_latch_get_pfs_key(id)); +#endif /* UNIV_PFS_MUTEX */ + + m_impl.init(id, filename, line); + policy().init(m_impl, id, filename, line); + ut_d(policy().context.init(id)); + } + + /** Free resources (if any) */ + void destroy() UNIV_NOTHROW + { +#ifdef UNIV_PFS_MUTEX + pfs_del(); +#endif /* UNIV_PFS_MUTEX */ + m_impl.destroy(); + policy().destroy(); + ut_d(policy().context.destroy()); + } + + /** Required for os_event_t */ + operator sys_mutex_t*() UNIV_NOTHROW + { + return(m_impl.operator sys_mutex_t*()); + } + +#ifdef UNIV_PFS_MUTEX + /** Performance schema monitoring - register mutex with PFS. + + Note: This is public only because we want to get around an issue + with registering a subset of buffer pool pages with PFS when + PFS_GROUP_BUFFER_SYNC is defined. Therefore this has to then + be called by external code (see buf0buf.cc). + + @param key - Performance Schema key. */ + void pfs_add(mysql_pfs_key_t key) UNIV_NOTHROW + { + ut_ad(m_ptr == 0); + m_ptr = PSI_MUTEX_CALL(init_mutex)(key, this); + } + +private: + + /** Performance schema monitoring. + @param state - PFS locker state + @param name - file name where locked + @param line - line number in file where locked */ + PSI_mutex_locker* pfs_begin_lock( + PSI_mutex_locker_state* state, + const char* name, + uint32_t line) UNIV_NOTHROW + { + if (m_ptr != 0) { + return(PSI_MUTEX_CALL(start_mutex_wait)( + state, m_ptr, + PSI_MUTEX_LOCK, name, (uint) line)); + } + + return(0); + } + + /** Performance schema monitoring. 
+ @param state - PFS locker state + @param name - file name where locked + @param line - line number in file where locked */ + PSI_mutex_locker* pfs_begin_trylock( + PSI_mutex_locker_state* state, + const char* name, + uint32_t line) UNIV_NOTHROW + { + if (m_ptr != 0) { + return(PSI_MUTEX_CALL(start_mutex_wait)( + state, m_ptr, + PSI_MUTEX_TRYLOCK, name, (uint) line)); + } + + return(0); + } + + /** Performance schema monitoring + @param locker - PFS identifier + @param ret - 0 for success and 1 for failure */ + void pfs_end(PSI_mutex_locker* locker, int ret) UNIV_NOTHROW + { + if (locker != 0) { + PSI_MUTEX_CALL(end_mutex_wait)(locker, ret); + } + } + + /** Performance schema monitoring - register mutex release */ + void pfs_exit() + { + if (m_ptr != 0) { + PSI_MUTEX_CALL(unlock_mutex)(m_ptr); + } + } + + /** Performance schema monitoring - deregister */ + void pfs_del() + { + if (m_ptr != 0) { + PSI_MUTEX_CALL(destroy_mutex)(m_ptr); + m_ptr = 0; + } + } +#endif /* UNIV_PFS_MUTEX */ + +private: + /** The mutex implementation */ + MutexImpl m_impl; + +#ifdef UNIV_PFS_MUTEX + /** The performance schema instrumentation hook. */ + PSI_mutex* m_ptr; +#endif /* UNIV_PFS_MUTEX */ + +}; + +#endif /* ib0mutex_h */ + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h new file mode 100644 index 00000000..cb418e57 --- /dev/null +++ b/storage/innobase/include/ibuf0ibuf.h @@ -0,0 +1,411 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0ibuf.h +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0ibuf_h +#define ibuf0ibuf_h + +#include "mtr0mtr.h" +#include "dict0mem.h" +#include "fsp0fsp.h" +#include "ibuf0types.h" + +/** Default value for maximum on-disk size of change buffer in terms +of percentage of the buffer pool. */ +#define CHANGE_BUFFER_DEFAULT_SIZE (25) + +/* Possible operations buffered in the insert/whatever buffer. See +ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */ +typedef enum { + IBUF_OP_INSERT = 0, + IBUF_OP_DELETE_MARK = 1, + IBUF_OP_DELETE = 2, + + /* Number of different operation types. */ + IBUF_OP_COUNT = 3 +} ibuf_op_t; + +/** Combinations of operations that can be buffered. +@see innodb_change_buffering_names */ +enum ibuf_use_t { + IBUF_USE_NONE = 0, + IBUF_USE_INSERT, /* insert */ + IBUF_USE_DELETE_MARK, /* delete */ + IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */ + IBUF_USE_DELETE, /* delete+purge */ + IBUF_USE_ALL /* insert+delete+purge */ +}; + +/** Operations that can currently be buffered. */ +extern ulong innodb_change_buffering; + +/** The insert buffer control structure */ +extern ibuf_t ibuf; + +/* The purpose of the insert buffer is to reduce random disk access. 
+When we wish to insert a record into a non-unique secondary index and +the B-tree leaf page where the record belongs to is not in the buffer +pool, we insert the record into the insert buffer B-tree, indexed by +(space_id, page_no). When the page is eventually read into the buffer +pool, we look up the insert buffer B-tree for any modifications to the +page, and apply these upon the completion of the read operation. This +is called the insert buffer merge. */ + +/* The insert buffer merge must always succeed. To guarantee this, +the insert buffer subsystem keeps track of the free space in pages for +which it can buffer operations. Two bits per page in the insert +buffer bitmap indicate the available space in coarse increments. The +free bits in the insert buffer bitmap must never exceed the free space +on a page. It is safe to decrement or reset the bits in the bitmap in +a mini-transaction that is committed before the mini-transaction that +affects the free space. It is unsafe to increment the bits in a +separately committed mini-transaction, because in crash recovery, the +free bits could momentarily be set too high. */ + +/******************************************************************//** +Creates the insert buffer data structure at a database startup. +@return DB_SUCCESS or failure */ +dberr_t +ibuf_init_at_db_start(void); +/*=======================*/ +/*********************************************************************//** +Updates the max_size value for ibuf. */ +void +ibuf_max_size_update( +/*=================*/ + ulint new_val); /*!< in: new value in terms of + percentage of the buffer pool size */ +/*********************************************************************//** +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. 
*/ +void +ibuf_update_max_tablespace_id(void); +/*===============================*/ +/***************************************************************//** +Starts an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_start( +/*===========*/ + mtr_t* mtr) /*!< out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Commits an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_commit( +/*============*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/************************************************************************//** +Resets the free bits of the page in the ibuf bitmap. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to decrement or reset the bits in the bitmap in a mini-transaction +that is committed before the mini-transaction that affects the free +space. */ +void +ibuf_reset_free_bits( +/*=================*/ + buf_block_t* block); /*!< in: index page; free bits are set to 0 + if the index is a non-clustered + non-unique, and page level is 0 */ +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. 
It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. */ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase);/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +/**********************************************************************//** +Updates the free bits for an uncompressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +void +ibuf_update_free_bits_low( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + ulint max_ins_size, /*!< in: value of + maximum insert size + with reorganize before + the latest operation + performed to the page */ + mtr_t* mtr); /*!< in/out: mtr */ +/**********************************************************************//** +Updates the free bits for a compressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. 
It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +void +ibuf_update_free_bits_zip( +/*======================*/ + buf_block_t* block, /*!< in/out: index page */ + mtr_t* mtr); /*!< in/out: mtr */ +/**********************************************************************//** +Updates the free bits for the two pages to reflect the present state. +Does this in the mtr given, which means that the latching order rules +virtually prevent any further operations until mtr is committed. +NOTE: The free bits in the insert buffer bitmap must never exceed the +free space on a page. It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique); /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Checks if a page address is an ibuf bitmap page (level 3 page) address. 
+@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return TRUE if a bitmap page */ +inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size) +{ + ut_ad(ut_is_2pow(zip_size)); + ulint size = zip_size ? zip_size : srv_page_size; + return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET; +} + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] x_latch false if relaxed check (avoid latching the +bitmap page) +@param[in] file file name +@param[in] line line where called +@param[in,out] mtr mtr which will contain an x-latch to the +bitmap page if the page is not one of the fixed address ibuf pages, or NULL, +in which case a new transaction is created. +@return true if level 2 or level 3 page */ +bool +ibuf_page_low( + const page_id_t page_id, + ulint zip_size, +#ifdef UNIV_DEBUG + bool x_latch, +#endif /* UNIV_DEBUG */ + const char* file, + unsigned line, + mtr_t* mtr) + MY_ATTRIBUTE((warn_unused_result)); + +#ifdef UNIV_DEBUG + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id tablespace/page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction or NULL +@return true if level 2 or level 3 page */ +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, true, __FILE__, __LINE__, mtr) + +#else /* UNIV_DEBUG */ + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. 
+@param[in] page_id tablespace/page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction or NULL +@return TRUE if level 2 or level 3 page */ +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, __FILE__, __LINE__, mtr) + +#endif /* UNIV_DEBUG */ +/***********************************************************************//** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call. */ +void +ibuf_free_excess_pages(void); +/*========================*/ + +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index +is clustered or unique. +@param[in] op operation type +@param[in] entry index entry to insert +@param[in,out] index index where to insert +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return true if success */ +bool +ibuf_insert( + ibuf_op_t op, + const dtuple_t* entry, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr); + +/** Check whether buffered changes exist for a page. +@param[in] id page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether buffered changes exist */ +bool ibuf_page_exists(const page_id_t id, ulint zip_size); + +/** When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. 
+@param block X-latched page to try to apply changes to, or NULL to discard +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ +void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id, + ulint zip_size); + +/** Delete all change buffer entries for a tablespace, +in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery. +@param[in] space missing or to-be-discarded tablespace */ +void ibuf_delete_for_discarded_space(ulint space); + +/** Contract the change buffer by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +ulint ibuf_merge_all(); + +/** Contracts insert buffer trees by reading pages referring to space_id +to the buffer pool. +@returns number of pages merged.*/ +ulint +ibuf_merge_space( +/*=============*/ + ulint space); /*!< in: space id */ + +/******************************************************************//** +Looks if the insert buffer is empty. +@return true if empty */ +bool +ibuf_is_empty(void); +/*===============*/ +/******************************************************************//** +Prints info of ibuf. */ +void +ibuf_print( +/*=======*/ + FILE* file); /*!< in: file where to print */ +/******************************************************************** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */ +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec); /*!< in: ibuf record */ +/******************************************************************//** +Closes insert buffer and frees the data structures. */ +void +ibuf_close(void); +/*============*/ + +/** Check the insert buffer bitmaps on IMPORT TABLESPACE. 
+@param[in] trx transaction +@param[in,out] space tablespace being imported +@return DB_SUCCESS or error code */ +dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Updates free bits and buffered bits for bulk loaded page. +@param[in] block index page +@param[in] reset flag if reset free val */ +void +ibuf_set_bitmap_for_bulk_load( + buf_block_t* block, + bool reset); + +#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO +#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO + +/* The ibuf header page currently contains only the file segment header +for the file segment from which the pages for the ibuf tree are allocated */ +#define IBUF_HEADER PAGE_DATA +#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */ + +/* The insert buffer tree itself is always located in space 0. */ +#define IBUF_SPACE_ID static_cast<ulint>(0) + +#include "ibuf0ibuf.ic" + +#endif diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic new file mode 100644 index 00000000..2c262051 --- /dev/null +++ b/storage/innobase/include/ibuf0ibuf.ic @@ -0,0 +1,307 @@ +/***************************************************************************** + +Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0ibuf.ic +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#include "page0page.h" +#include "page0zip.h" +#include "fsp0types.h" +#include "buf0lru.h" + +/** An index page must contain at least srv_page_size / +IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to +buffer inserts to this page. If there is this much of free space, the +corresponding bits are set in the ibuf bitmap. */ +#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32 + +/***************************************************************//** +Starts an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_start( +/*===========*/ + mtr_t* mtr) /*!< out: mini-transaction */ +{ + mtr_start(mtr); + mtr->enter_ibuf(); + + if (high_level_read_only || srv_read_only_mode) { + mtr_set_log_mode(mtr, MTR_LOG_NO_REDO); + } + +} +/***************************************************************//** +Commits an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_commit( +/*============*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->is_inside_ibuf()); + ut_d(mtr->exit_ibuf()); + + mtr_commit(mtr); +} + +/** Insert buffer struct */ +struct ibuf_t{ + ulint size; /*!< current size of the ibuf index + tree, in pages */ + ulint max_size; /*!< recommended maximum size of the + ibuf index tree, in pages */ + ulint seg_size; /*!< allocated pages of the file + segment containing ibuf header and + tree */ + bool empty; /*!< Protected by the page + latch of the root page of the + insert buffer tree + (FSP_IBUF_TREE_ROOT_PAGE_NO). 
true + if and only if the insert + buffer tree is empty. */ + ulint free_list_len; /*!< length of the free list */ + ulint height; /*!< tree height */ + dict_index_t* index; /*!< insert buffer index */ + + /** number of pages merged */ + Atomic_counter<ulint> n_merges; + Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + merged to index pages */ + Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + discarded without merging due to the + tablespace being deleted or the + index being dropped */ +}; + +/************************************************************************//** +Sets the free bit of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. */ +void +ibuf_set_free_bits_func( +/*====================*/ + buf_block_t* block, /*!< in: index page of a non-clustered index; + free bit is reset if page level is 0 */ +#ifdef UNIV_IBUF_DEBUG + ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum + value which the bits must have before + setting; this is for debugging */ +#endif /* UNIV_IBUF_DEBUG */ + ulint val); /*!< in: value to set: < 4 */ +#ifdef UNIV_IBUF_DEBUG +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v) +#else /* UNIV_IBUF_DEBUG */ +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v) +#endif /* UNIV_IBUF_DEBUG */ + +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. 
*/ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique) /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +{ + return(innodb_change_buffering + && ibuf.max_size != 0 + && !dict_index_is_clust(index) + && !dict_index_is_spatial(index) + && index->table->quiesce == QUIESCE_NONE + && (ignore_sec_unique || !dict_index_is_unique(index))); +} + +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ +{ + return(mtr->is_inside_ibuf()); +} + +/** Translates the free space on a page to a value in the ibuf bitmap. +@param[in] page_size page size in bytes +@param[in] max_ins_size maximum insert size after reorganize for +the page +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_bits( + ulint page_size, + ulint max_ins_size) +{ + ulint n; + ut_ad(ut_is_2pow(page_size)); + ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + + n = max_ins_size / (page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + + if (n == 3) { + n = 2; + } + + if (n > 3) { + n = 3; + } + + return(n); +} + +/*********************************************************************//** +Translates the free space on a compressed page to a value in the ibuf bitmap. 
+@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_zip( +/*==========================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + ulint max_ins_size; + const page_zip_des_t* page_zip; + lint zip_max_ins; + + ut_ad(block->page.zip.data); + + /* Consider the maximum insert size on the uncompressed page + without reorganizing the page. We must not assume anything + about the compression ratio. If zip_max_ins > max_ins_size and + there is 1/4 garbage on the page, recompression after the + reorganize could fail, in theory. So, let us guarantee that + merging a buffered insert to a compressed page will always + succeed without reorganizing or recompressing the page, just + by using the page modification log. */ + max_ins_size = page_get_max_insert_size( + buf_block_get_frame(block), 1); + + page_zip = buf_block_get_page_zip(block); + zip_max_ins = page_zip_max_ins_size(page_zip, + FALSE/* not clustered */); + + if (zip_max_ins < 0) { + return(0); + } else if (max_ins_size > (ulint) zip_max_ins) { + max_ins_size = (ulint) zip_max_ins; + } + + return(ibuf_index_page_calc_free_bits(block->physical_size(), + max_ins_size)); +} + +/*********************************************************************//** +Translates the free space on a page to a value in the ibuf bitmap. 
+@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free( +/*======================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + if (!block->page.zip.data) { + ulint max_ins_size; + + max_ins_size = page_get_max_insert_size_after_reorganize( + buf_block_get_frame(block), 1); + + return(ibuf_index_page_calc_free_bits( + block->physical_size(), max_ins_size)); + } else { + return(ibuf_index_page_calc_free_zip(block)); + } +} + +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free space on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. 
*/ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase)/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +{ + ulint before; + ulint after; + + ut_ad(buf_block_get_page_zip(block) == NULL); + + before = ibuf_index_page_calc_free_bits( + srv_page_size, max_ins_size); + + if (max_ins_size >= increase) { + compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX); + after = ibuf_index_page_calc_free_bits( + srv_page_size, max_ins_size - increase); +#ifdef UNIV_IBUF_DEBUG + ut_a(after <= ibuf_index_page_calc_free(block)); +#endif + } else { + after = ibuf_index_page_calc_free(block); + } + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + if (before > after) { + ibuf_set_free_bits(block, after, before); + } +} diff --git a/storage/innobase/include/ibuf0types.h b/storage/innobase/include/ibuf0types.h new file mode 100644 index 00000000..6b7c4720 --- /dev/null +++ b/storage/innobase/include/ibuf0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0types.h +Insert buffer global types + +Created 7/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0types_h +#define ibuf0types_h + +struct ibuf_t; + +#endif diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h new file mode 100644 index 00000000..a7e61395 --- /dev/null +++ b/storage/innobase/include/lock0iter.h @@ -0,0 +1,66 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0iter.h +Lock queue iterator type and function prototypes. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0iter_h +#define lock0iter_h + +#include "lock0types.h" + +struct lock_queue_iterator_t { + const lock_t* current_lock; + /* In case this is a record lock queue (not table lock queue) + then bit_no is the record number within the heap in which the + record is stored. */ + ulint bit_no; +}; + +/*******************************************************************//** +Initialize lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). There is exactly one bit set in the bitmap + of a wait lock. */ +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /*!< out: iterator */ + const lock_t* lock, /*!< in: lock to start from */ + ulint bit_no);/*!< in: record number in the + heap */ + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). 
+@return previous lock or NULL */ +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter); /*!< in/out: iterator */ + +#endif /* lock0iter_h */ diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h new file mode 100644 index 00000000..3b63b06a --- /dev/null +++ b/storage/innobase/include/lock0lock.h @@ -0,0 +1,990 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.h +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0lock_h +#define lock0lock_h + +#include "buf0types.h" +#include "trx0types.h" +#include "mtr0types.h" +#include "rem0types.h" +#include "que0types.h" +#include "lock0types.h" +#include "hash0hash.h" +#include "srv0srv.h" +#include "ut0vec.h" +#include "gis0rtree.h" +#include "lock0prdt.h" + +/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by + setting innodb_lock_schedule_algorithm. 
*/ +enum innodb_lock_schedule_algorithm_t { + /*!< First Come First Served */ + INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + /*!< Variance-Aware-Transaction-Scheduling */ + INNODB_LOCK_SCHEDULE_ALGORITHM_VATS +}; + +extern ulong innodb_lock_schedule_algorithm; + +// Forward declaration +class ReadView; + +/** The value of innodb_deadlock_detect */ +extern my_bool innobase_deadlock_detect; + +/*********************************************************************//** +Gets the size of a lock struct. +@return size in bytes */ +ulint +lock_get_size(void); +/*===============*/ +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. +@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block); /*!< in: buffer block */ +/*************************************************************//** +Updates the lock table when we have reorganized a page. NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock);/*!< in: copy of the old, not + reorganized page */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. 
*/ +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec); /*!< in: record on page: this + is the first record moved */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end); /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the right. */ +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block); /*!< in: merged index + page which will be + discarded */ +/*************************************************************//** +Updates the lock table when the root page is copied to another in +btr_root_raise_and_insert. 
Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +void +lock_update_root_raise( +/*===================*/ + const buf_block_t* block, /*!< in: index page to which copied */ + const buf_block_t* root); /*!< in: root page */ +/*************************************************************//** +Updates the lock table when a page is copied to another and the original page +is removed from the chain of leaf pages, except if page is the root! */ +void +lock_update_copy_and_discard( +/*=========================*/ + const buf_block_t* new_block, /*!< in: index page to + which copied */ + const buf_block_t* block); /*!< in: index page; + NOT the root! */ +/*************************************************************//** +Updates the lock table when a page is split to the left. */ +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the left. */ +void +lock_update_merge_left( +/*===================*/ + const buf_block_t* left_block, /*!< in: left page to + which merged */ + const rec_t* orig_pred, /*!< in: original predecessor + of supremum on the left page + before merge */ + const buf_block_t* right_block); /*!< in: merged index page + which will be discarded */ +/*************************************************************//** +Updates the lock table when a page is split and merged to +two pages. 
*/ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block);/*!< in: right page from which merged */ +/*************************************************************//** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec. */ +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t* heir_block, /*!< in: block containing the + record which inherits */ + const buf_block_t* block, /*!< in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /*!< in: heap_no of the + inheriting record */ + ulint heap_no); /*!< in: heap_no of the + donating record */ +/*************************************************************//** +Updates the lock table when a page is discarded. */ +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /*!< in: index page + which will inherit the locks */ + ulint heir_heap_no, /*!< in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block); /*!< in: index page + which will be discarded */ +/*************************************************************//** +Updates the lock table when a new user record is inserted. */ +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the inserted record */ +/*************************************************************//** +Updates the lock table when a record is removed. 
*/ +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the record to be removed */ +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is in such an update moved, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +/*********************************************************************//** +Restores the state of explicit lock requests on a single record, where the +state was stored on the infimum of the page. */ +void +lock_rec_restore_from_page_infimum( +/*===============================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record whose lock state + is restored */ + const buf_block_t* donator);/*!< in: page (rec is not + necessarily on this page) + whose infimum stored the lock + state; lock bits are reset on + the infimum */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a record. If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. 
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_rec_insert_check_and_lock( +/*===========================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is + set, does nothing */ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: set to true if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify (update, +delete mark, or delete unmark) of a clustered index record. If they do, +first tests if the query thread should anyway be suspended for some +reason; if not, then puts the transaction and the query thread to the +lock wait state and inserts a waiting request for a record x-lock to the +lock queue. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_modify_check_and_lock( +/*=================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified */ + dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify +(delete mark or delete unmark) of a secondary index record. 
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_sec_rec_modify_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified; NOTE: as this is a secondary + index, we always have to modify the + clustered index record first: see the + comment below */ + dict_index_t* index, /*!< in: secondary index */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Like lock_clust_rec_read_check_and_lock(), but reads a +secondary index record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_sec_rec_read_check_and_lock( +/*=============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: secondary index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. 
If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_read_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". 
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Checks that a record is seen in a consistent read. +@return true if sees, or false if an earlier version of the record +should be retrieved */ +bool +lock_clust_rec_cons_read_sees( +/*==========================*/ + const rec_t* rec, /*!< in: user record which should be read or + passed over by a read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + ReadView* view); /*!< in: consistent read view */ +/*********************************************************************//** +Checks that a non-clustered index record is seen in a consistent read. + +NOTE that a non-clustered index page contains so little information on +its modifications that also in the case false, the present version of +rec may be the right, but we must check this from the clustered index +record. 
+ +@return true if certainly sees, or false if an earlier version of the +clustered index record might be needed */ +bool +lock_sec_rec_cons_read_sees( +/*========================*/ + const rec_t* rec, /*!< in: user record which + should be read or passed over + by a read cursor */ + const dict_index_t* index, /*!< in: index */ + const ReadView* view) /*!< in: consistent read view */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Locks the specified database table in the mode given. If the lock cannot +be granted immediately, the query thread is put to wait. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_table( +/*=======*/ + unsigned flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + dict_table_t* table, /*!< in/out: database table + in dictionary cache */ + lock_mode mode, /*!< in: lock mode */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Creates a table IX lock object for a resurrected transaction. */ +void +lock_table_ix_resurrect( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx); /*!< in/out: transaction */ + +/** Sets a lock on a table based on the given mode. +@param[in] table table to lock +@param[in,out] trx transaction +@param[in] mode LOCK_X or LOCK_S +@return error code or DB_SUCCESS. */ +dberr_t +lock_table_for_trx( + dict_table_t* table, + trx_t* trx, + enum lock_mode mode) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*************************************************************//** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. 
*/ +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record */ + lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */ + +/** Release the explicit locks of a committing transaction, +and release possible other transactions waiting because of these locks. */ +void lock_release(trx_t* trx); + +/*************************************************************//** +Get the lock hash table */ +UNIV_INLINE +hash_table_t* +lock_hash_get( +/*==========*/ + ulint mode); /*!< in: lock mode */ + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock); /*!< in: record lock with at least one + bit set */ + +/*********************************************************************//** +Checks if a lock request lock1 has to wait for request lock2. +@return whether lock1 has to wait for lock2 to be removed */ +bool +lock_has_to_wait( +/*=============*/ + const lock_t* lock1, /*!< in: waiting lock */ + const lock_t* lock2); /*!< in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +/*********************************************************************//** +Reports that a transaction id is insensible, i.e., in the future. 
*/ +ATTRIBUTE_COLD +void +lock_report_trx_id_insanity( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */ +/*********************************************************************//** +Prints info of locks for all transactions. +@return FALSE if not able to obtain lock mutex and exits without +printing info */ +ibool +lock_print_info_summary( +/*====================*/ + FILE* file, /*!< in: file where to print */ + ibool nowait) /*!< in: whether to wait for the lock mutex */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Prints transaction lock wait and MVCC state. +@param[in,out] file file where to print +@param[in] trx transaction +@param[in] now current time */ +void +lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now); + +/*********************************************************************//** +Prints info of locks for each transaction. This function assumes that the +caller holds the lock mutex and more importantly it will release the lock +mutex on behalf of the caller. (This should be fixed in the future). */ +void +lock_print_info_all_transactions( +/*=============================*/ + FILE* file); /*!< in: file where to print */ +/*********************************************************************//** +Return approximate number of record locks (bits set in the bitmap) for +this transaction. Since delete-marked records may be removed, the +record count will not be precise. +The caller must be holding lock_sys.mutex. */ +ulint +lock_number_of_rows_locked( +/*=======================*/ + const trx_lock_t* trx_lock) /*!< in: transaction locks */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Return the number of table locks for a transaction. 
+The caller must be holding lock_sys.mutex. */ +ulint +lock_number_of_tables_locked( +/*=========================*/ + const trx_lock_t* trx_lock) /*!< in: transaction locks */ + MY_ATTRIBUTE((warn_unused_result)); + +/*******************************************************************//** +Gets the type of a lock. Non-inline version for using outside of the +lock module. +@return LOCK_TABLE or LOCK_REC */ +ulint +lock_get_type( +/*==========*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the id of the table on which the lock is. +@return id of the table */ +table_id_t +lock_get_table_id( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/** Determine which table a lock is associated with. +@param[in] lock the lock +@return name of the table */ +const table_name_t& +lock_get_table_name( + const lock_t* lock); + +/*******************************************************************//** +For a record lock, gets the index on which the lock is. +@return index */ +const dict_index_t* +lock_rec_get_index( +/*===============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the name of the index on which the lock is. +The string should not be free()'d or modified. +@return name of the index */ +const char* +lock_rec_get_index_name( +/*====================*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Check if there are any locks (table or rec) against table. 
+@return TRUE if locks exist */ +bool +lock_table_has_locks( +/*=================*/ + const dict_table_t* table); /*!< in: check if there are any locks + held on records in this table or on the + table itself */ + +/** A task which wakes up threads whose lock wait may have lasted too long */ +void lock_wait_timeout_task(void*); + +/********************************************************************//** +Releases a user OS thread waiting for a lock to be released, if the +thread is already suspended. */ +void +lock_wait_release_thread_if_suspended( +/*==================================*/ + que_thr_t* thr); /*!< in: query thread associated with the + user OS thread */ + +/***************************************************************//** +Puts a user OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. */ +void +lock_wait_suspend_thread( +/*=====================*/ + que_thr_t* thr); /*!< in: query thread associated with the + user OS thread */ +/*********************************************************************//** +Unlocks AUTO_INC type locks that were possibly reserved by a trx. This +function should be called at the end of an SQL statement, by the +connection thread that owns the transaction (trx->mysql_thd). */ +void +lock_unlock_table_autoinc( +/*======================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Check whether the transaction has already been rolled back because it +was selected as a deadlock victim, or if it has to wait then cancel +the wait lock. 
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */ +dberr_t +lock_trx_handle_wait( +/*=================*/ + trx_t* trx); /*!< in/out: trx lock state */ +/*********************************************************************//** +Get the number of locks on a table. +@return number of locks */ +ulint +lock_table_get_n_locks( +/*===================*/ + const dict_table_t* table); /*!< in: table */ +/*******************************************************************//** +Initialise the trx lock list. */ +void +lock_trx_lock_list_init( +/*====================*/ + trx_lock_list_t* lock_list); /*!< List to initialise */ + +/*********************************************************************//** +Checks that a transaction id is sensible, i.e., not in the future. +@return true if ok */ +bool +lock_check_trx_id_sanity( +/*=====================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets); /*!< in: rec_get_offsets(rec, index) */ +#ifdef UNIV_DEBUG +/*******************************************************************//** +Check if the transaction holds any locks on the sys tables +or its records. +@return the strongest lock found on any sys table or 0 for none */ +const lock_t* +lock_trx_has_sys_table_locks( +/*=========================*/ + const trx_t* trx) /*!< in: transaction to check */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Check if the transaction holds an explicit exclusive lock on a record. 
+@param[in] trx transaction +@param[in] table table +@param[in] block leaf page +@param[in] heap_no heap number identifying the record +@return whether an explicit X-lock is held */ +bool +lock_trx_has_expl_x_lock( + const trx_t* trx, /*!< in: transaction to check */ + const dict_table_t* table, /*!< in: table to check */ + const buf_block_t* block, /*!< in: buffer block of the record */ + ulint heap_no)/*!< in: record heap number */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/** Lock operation struct */ +struct lock_op_t{ + dict_table_t* table; /*!< table to be locked */ + lock_mode mode; /*!< lock mode */ +}; + +typedef ib_mutex_t LockMutex; + +/** The lock system struct */ +class lock_sys_t +{ + bool m_initialised; + +public: + MY_ALIGNED(CACHE_LINE_SIZE) + LockMutex mutex; /*!< Mutex protecting the + locks */ + /** record locks */ + hash_table_t rec_hash; + /** predicate locks for SPATIAL INDEX */ + hash_table_t prdt_hash; + /** page locks for SPATIAL INDEX */ + hash_table_t prdt_page_hash; + + MY_ALIGNED(CACHE_LINE_SIZE) + LockMutex wait_mutex; /*!< Mutex protecting the + next two fields */ + srv_slot_t* waiting_threads; /*!< Array of user threads + suspended while waiting for + locks within InnoDB, protected + by the lock_sys.wait_mutex; + os_event_set() and + os_event_reset() on + waiting_threads[]->event + are protected by + trx_t::mutex */ + srv_slot_t* last_slot; /*!< highest slot ever used + in the waiting_threads array, + protected by + lock_sys.wait_mutex */ + + ulint n_lock_max_wait_time; /*!< Max wait time */ + + std::unique_ptr<tpool::timer> timeout_timer; /*!< Thread pool timer task */ + bool timeout_timer_active; + + + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). 
+ */ + lock_sys_t(): m_initialised(false) {} + + + bool is_initialised() { return m_initialised; } + + + /** + Creates the lock system at database start. + + @param[in] n_cells number of slots in lock hash table + */ + void create(ulint n_cells); + + + /** + Resize the lock hash table. + + @param[in] n_cells number of slots in lock hash table + */ + void resize(ulint n_cells); + + + /** Closes the lock system at database shutdown. */ + void close(); + + /** @return the hash value for a page address */ + ulint hash(const page_id_t id) const + { ut_ad(mutex_own(&mutex)); return rec_hash.calc_hash(id.fold()); } + + /** Get the first lock on a page. + NOTE(review): the cell is computed by hash(), i.e. always from rec_hash, + even when lock_hash is prdt_hash or prdt_page_hash; presumably all three + tables have the same number of cells -- confirm in create() and resize(). + @param lock_hash hash table to look at + @param id page number + @return first lock + @retval nullptr if none exists */ + lock_t *get_first(const hash_table_t &lock_hash, const page_id_t id) const + { + ut_ad(&lock_hash == &rec_hash || &lock_hash == &prdt_hash || + &lock_hash == &prdt_page_hash); + for (lock_t *lock= static_cast<lock_t*> + (HASH_GET_FIRST(&lock_hash, hash(id))); + lock; lock= static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) + if (lock->un_member.rec_lock.page_id == id) + return lock; + return nullptr; + } + + /** Get the first record lock on a page. + @param id page number + @return first lock + @retval nullptr if none exists */ + lock_t *get_first(const page_id_t id) const + { return get_first(rec_hash, id); } + /** Get the first predicate lock on a SPATIAL INDEX page. + @param id page number + @return first lock + @retval nullptr if none exists */ + lock_t *get_first_prdt(const page_id_t id) const + { return get_first(prdt_hash, id); } + /** Get the first page lock on a SPATIAL INDEX page.
+ @param id page number + @return first lock + @retval nullptr if none exists */ + lock_t *get_first_prdt_page(const page_id_t id) const + { return get_first(prdt_page_hash, id); } +}; + +/*********************************************************************//** +Creates a new record lock and inserts it to the lock queue. Does NOT check +for deadlocks or lock compatibility! +@return created lock */ +UNIV_INLINE +lock_t* +lock_rec_create( +/*============*/ +#ifdef WITH_WSREP + lock_t* c_lock, /*!< conflicting lock */ + que_thr_t* thr, /*!< thread owning trx */ +#endif + unsigned type_mode,/*!< in: lock mode and wait + flag, type is ignored and + replaced by LOCK_REC */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of the record */ + dict_index_t* index, /*!< in: index of record */ + trx_t* trx, /*!< in,out: transaction */ + bool caller_owns_trx_mutex); + /*!< in: true if caller owns + trx mutex */ + +/*************************************************************//** +Removes a record lock request, waiting or granted, from the queue. */ +void +lock_rec_discard( +/*=============*/ + lock_t* in_lock); /*!< in: record lock object: all + record locks which are contained + in this lock object are removed */ + +/** Create a new record lock and inserts it to the lock queue, +without checking for deadlocks or conflicts. 
+@param[in] type_mode lock mode and wait flag; type will be replaced + with LOCK_REC +@param[in] page_id index page number +@param[in] page R-tree index page, or NULL +@param[in] heap_no record heap number in the index page +@param[in] index the index tree +@param[in,out] trx transaction +@param[in] holds_trx_mutex whether the caller holds trx->mutex +@return created lock */ +lock_t* +lock_rec_create_low( +#ifdef WITH_WSREP + lock_t* c_lock, /*!< conflicting lock */ + que_thr_t* thr, /*!< thread owning trx */ +#endif + unsigned type_mode, + const page_id_t page_id, + const page_t* page, + ulint heap_no, + dict_index_t* index, + trx_t* trx, + bool holds_trx_mutex); +/** Enqueue a waiting request for a lock which cannot be granted immediately. +Check for deadlocks. +@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X) + possibly ORed with LOCK_GAP or + LOCK_REC_NOT_GAP, ORed with + LOCK_INSERT_INTENTION if this + waiting lock request is set + when performing an insert of + an index record +@param[in] block leaf page in the index +@param[in] heap_no record heap number in the block +@param[in] index index tree +@param[in,out] thr query thread +@param[in] prdt minimum bounding box (spatial index) +@retval DB_LOCK_WAIT if the waiting lock was enqueued +@retval DB_DEADLOCK if this transaction was chosen as the victim +@retval DB_SUCCESS_LOCKED_REC if the other transaction was chosen as a victim + (or it happened to commit) */ +dberr_t +lock_rec_enqueue_waiting( +#ifdef WITH_WSREP + lock_t* c_lock, /*!< conflicting lock */ +#endif + unsigned type_mode, + const buf_block_t* block, + ulint heap_no, + dict_index_t* index, + que_thr_t* thr, + lock_prdt_t* prdt); +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. 
*/ +void +lock_rtr_move_rec_list( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to + move to */ + const buf_block_t* block, /*!< in: index page */ + rtr_rec_move_t* rec_move, /*!< in: recording records + moved */ + ulint num_move); /*!< in: num of rec to move */ + +/*************************************************************//** +Removes record lock objects set on an index page which is discarded. This +function does not move locks, or check for waiting locks, therefore the +lock bitmaps must already be reset when this function is called. */ +void +lock_rec_free_all_from_discard_page( +/*================================*/ + const buf_block_t* block); /*!< in: page to be discarded */ + +/** The lock system */ +extern lock_sys_t lock_sys; + +/** Test if lock_sys.mutex can be acquired without waiting. */ +#define lock_mutex_enter_nowait() \ + (lock_sys.mutex.trylock(__FILE__, __LINE__)) + +/** Test if lock_sys.mutex is owned. */ +#define lock_mutex_own() (lock_sys.mutex.is_owned()) + +/** Acquire the lock_sys.mutex. */ +#define lock_mutex_enter() do { \ + mutex_enter(&lock_sys.mutex); \ +} while (0) + +/** Release the lock_sys.mutex. */ +#define lock_mutex_exit() do { \ + lock_sys.mutex.exit(); \ +} while (0) + +/** Test if lock_sys.wait_mutex is owned. */ +#define lock_wait_mutex_own() (lock_sys.wait_mutex.is_owned()) + +/** Acquire the lock_sys.wait_mutex. */ +#define lock_wait_mutex_enter() do { \ + mutex_enter(&lock_sys.wait_mutex); \ +} while (0) + +/** Release the lock_sys.wait_mutex. */ +#define lock_wait_mutex_exit() do { \ + lock_sys.wait_mutex.exit(); \ +} while (0) + +#ifdef WITH_WSREP +/*********************************************************************//** +Cancels a waiting lock request and releases possible other transactions +waiting behind it. 
*/ +UNIV_INTERN +void +lock_cancel_waiting_and_release( +/*============================*/ + lock_t* lock); /*!< in/out: waiting lock request */ + +/*******************************************************************//** +Get lock mode and table/index name +@return string containing lock info */ +std::string +lock_get_info( + const lock_t*); + +#endif /* WITH_WSREP */ + +#include "lock0lock.ic" + +#endif diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic new file mode 100644 index 00000000..2d5b6ff3 --- /dev/null +++ b/storage/innobase/include/lock0lock.ic @@ -0,0 +1,103 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.ic +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0dict.h" +#include "buf0buf.h" +#include "page0page.h" + +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. 
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + const page_t* page = block->frame; + + if (page_is_comp(page)) { + return(rec_get_heap_no_new( + page + + rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE))); + } else { + return(rec_get_heap_no_old( + page + + rec_get_next_offs(page + PAGE_OLD_INFIMUM, + FALSE))); + } +} + +/*************************************************************//** +Get the lock hash table */ +UNIV_INLINE +hash_table_t* +lock_hash_get( +/*==========*/ + ulint mode) /*!< in: lock mode */ +{ + if (mode & LOCK_PREDICATE) { + return &lock_sys.prdt_hash; + } else if (mode & LOCK_PRDT_PAGE) { + return &lock_sys.prdt_page_hash; + } else { + return &lock_sys.rec_hash; + } +} + +/*********************************************************************//** +Creates a new record lock and inserts it to the lock queue. Does NOT check +for deadlocks or lock compatibility! 
+@return created lock */ +UNIV_INLINE +lock_t* +lock_rec_create( +/*============*/ +#ifdef WITH_WSREP + lock_t* c_lock, /*!< conflicting lock */ + que_thr_t* thr, /*!< thread owning trx */ +#endif + unsigned type_mode,/*!< in: lock mode and wait + flag, type is ignored and + replaced by LOCK_REC */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of the record */ + dict_index_t* index, /*!< in: index of record */ + trx_t* trx, /*!< in,out: transaction */ + bool caller_owns_trx_mutex) + /*!< in: TRUE if caller owns + trx mutex */ +{ + btr_assert_not_corrupted(block, index); + return lock_rec_create_low( +#ifdef WITH_WSREP + c_lock, thr, +#endif + type_mode, block->page.id(), block->frame, heap_no, + index, trx, caller_owns_trx_mutex); +} diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h new file mode 100644 index 00000000..43d68996 --- /dev/null +++ b/storage/innobase/include/lock0prdt.h @@ -0,0 +1,204 @@ +/***************************************************************************** + +Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0prdt.h +The predicate lock system + +Created 9/7/2013 Jimmy Yang +*******************************************************/ +#ifndef lock0prdt_h +#define lock0prdt_h + +#include "lock0lock.h" + +/* Predicate lock data */ +typedef struct lock_prdt { + void* data; /* Predicate data */ + uint16 op; /* Predicate operator */ +} lock_prdt_t; + +/*********************************************************************//** +Acquire a predicate lock on a block +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_prdt_lock( +/*===========*/ + buf_block_t* block, /*!< in/out: buffer block of rec */ + lock_prdt_t* prdt, /*!< in: Predicate for the lock */ + dict_index_t* index, /*!< in: secondary index */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned type_mode, + /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */ + que_thr_t* thr); /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + +/*********************************************************************//** +Acquire a "Page" lock on a block +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_place_prdt_page_lock( + const page_id_t page_id, /*!< in: page identifier */ + dict_index_t* index, /*!< in: secondary index */ + que_thr_t* thr); /*!< in: query thread */ + +/*********************************************************************//** +Initiate a Predicate lock from a MBR */ +void +lock_init_prdt_from_mbr( +/*====================*/ + lock_prdt_t* prdt, /*!< in/out: predicate to initialized */ + 
rtr_mbr_t* mbr, /*!< in: Minimum Bounding Rectangle */ + ulint mode, /*!< in: Search mode */ + mem_heap_t* heap); /*!< in: heap for allocating memory */ + +/*********************************************************************//** +Get predicate lock's minimum bounding box +@return the minimum bounding box*/ +lock_prdt_t* +lock_get_prdt_from_lock( +/*====================*/ + const lock_t* lock); /*!< in: the lock */ + +/*********************************************************************//** +Checks if a predicate lock request for a new lock has to wait for +request lock2. +@return true if new lock has to wait for lock2 to be removed */ +bool +lock_prdt_has_to_wait( +/*==================*/ + const trx_t* trx, /*!< in: trx of new lock */ + unsigned type_mode,/*!< in: precise mode of the new lock + to set: LOCK_S or LOCK_X, possibly + ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE, + LOCK_INSERT_INTENTION */ + lock_prdt_t* prdt, /*!< in: lock predicate to check */ + const lock_t* lock2); /*!< in: another record lock; NOTE that + it is assumed that this has a lock bit + set on the same record as in the new + lock we are setting */ + +/**************************************************************//** +Update predicate lock when page splits */ +void +lock_prdt_update_split( +/*===================*/ + buf_block_t* new_block, /*!< in/out: the new half page */ + lock_prdt_t* prdt, /*!< in: MBR on the old page */ + lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ + const page_id_t page_id); /*!< in: page number */ + +/**************************************************************//** +Adjust locks from an ancestor page of Rtree on the appropriate level.
*/ +void +lock_prdt_update_parent( +/*====================*/ + buf_block_t* left_block, /*!< in/out: page to be split */ + buf_block_t* right_block, /*!< in/out: the new half page */ + lock_prdt_t* left_prdt, /*!< in: MBR on the old page */ + lock_prdt_t* right_prdt, /*!< in: MBR on the new page */ + const page_id_t page_id); /*!< in: parent page */ + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a predicate record. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_prdt_insert_check_and_lock( +/*============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is + set, does nothing */ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + lock_prdt_t* prdt); /*!< in: Minimum Bound Rectangle */ + +/*********************************************************************//** +Append a predicate to the lock */ +void +lock_prdt_set_prdt( +/*===============*/ + lock_t* lock, /*!< in: lock */ + const lock_prdt_t* prdt); /*!< in: Predicate */ + +#if 0 + +/*********************************************************************//** +Checks if a predicate lock request for a new lock has to wait for +request lock2. 
+@return true if new lock has to wait for lock2 to be removed */ +UNIV_INLINE +bool +lock_prdt_has_to_wait( +/*==================*/ + const trx_t* trx, /*!< in: trx of new lock */ + unsigned type_mode,/*!< in: precise mode of the new lock + to set: LOCK_S or LOCK_X, possibly + ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE, + LOCK_INSERT_INTENTION */ + lock_prdt_t* prdt, /*!< in: lock predicate to check */ + const lock_t* lock2); /*!< in: another record lock; NOTE that + it is assumed that this has a lock bit + set on the same record as in the new + lock we are setting */ + +/*********************************************************************//** +Get predicate lock's minimum bounding box +@return the minimum bounding box*/ +UNIV_INLINE +rtr_mbr_t* +prdt_get_mbr_from_prdt( +/*===================*/ + const lock_prdt_t* prdt); /*!< in: the lock predicate */ + + +#endif +/*************************************************************//** +Moves the locks of a record to another record and resets the lock bits of +the donating record. */ +void +lock_prdt_rec_move( +/*===============*/ + const buf_block_t* receiver, /*!< in: buffer block containing + the receiving record */ + const buf_block_t* donator); /*!< in: buffer block containing + the donating record */ + +/** Check whether there are R-tree Page lock on a page +@param[in] trx trx to test the lock +@param[in] page_id page identifier +@return true if there is none */ +bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id); + +/** Removes predicate lock objects set on an index page which is discarded. 
+@param[in] block page to be discarded +@param[in] lock_hash lock hash */ +void +lock_prdt_page_free_from_discard( +/*=============================*/ + const buf_block_t* block, + hash_table_t* lock_hash); + +#endif diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h new file mode 100644 index 00000000..1b2f9d0f --- /dev/null +++ b/storage/innobase/include/lock0priv.h @@ -0,0 +1,653 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2018, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0priv.h +Lock module internal structures and methods. 
+ +Created July 12, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0priv_h +#define lock0priv_h + +#ifndef LOCK_MODULE_IMPLEMENTATION +/* If you need to access members of the structures defined in this +file, please write appropriate functions that retrieve them and put +those functions in lock/ */ +#error Do not include lock0priv.h outside of the lock/ module +#endif + +#include "hash0hash.h" +#include "rem0types.h" +#include "trx0trx.h" + +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +/** Print the table lock into the given output stream +@param[in,out] out the output stream +@return the given output stream. */ +inline +std::ostream& lock_table_t::print(std::ostream& out) const +{ + out << "[lock_table_t: name=" << table->name << "]"; + return(out); +} + +/** The global output operator is overloaded to conveniently +print the lock_table_t object into the given output stream. +@param[in,out] out the output stream +@param[in] lock the table lock +@return the given output stream */ +inline +std::ostream& +operator<<(std::ostream& out, const lock_table_t& lock) +{ + return(lock.print(out)); +} + +/** Convert the member 'type_mode' into a human readable string. 
+@return human readable string */ +inline +std::string +ib_lock_t::type_mode_string() const +{ + std::ostringstream sout; + sout << type_string(); + sout << " | " << lock_mode_string(mode()); + + if (is_record_not_gap()) { + sout << " | LOCK_REC_NOT_GAP"; + } + + if (is_waiting()) { + sout << " | LOCK_WAIT"; + } + + if (is_gap()) { + sout << " | LOCK_GAP"; + } + + if (is_insert_intention()) { + sout << " | LOCK_INSERT_INTENTION"; + } + return(sout.str()); +} + +inline +std::ostream& +ib_lock_t::print(std::ostream& out) const +{ + out << "[lock_t: type_mode=" << type_mode << "(" + << type_mode_string() << ")"; + + if (is_record_lock()) { + out << un_member.rec_lock; + } else { + out << un_member.tab_lock; + } + + out << "]"; + return(out); +} + +inline +std::ostream& +operator<<(std::ostream& out, const ib_lock_t& lock) +{ + return(lock.print(out)); +} + +#ifdef UNIV_DEBUG +extern ibool lock_print_waits; +#endif /* UNIV_DEBUG */ + +/** Restricts the length of search we will do in the waits-for +graph of transactions */ +static const ulint LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK = 1000000; + +/** Restricts the search depth we will do in the waits-for graph of +transactions */ +static const ulint LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK = 200; + +/** When releasing transaction locks, this specifies how often we release +the lock mutex for a moment to give also others access to it */ +static const ulint LOCK_RELEASE_INTERVAL = 1000; + +/* Safety margin when creating a new record lock: this many extra records +can be inserted to the page without need to create a lock with a bigger +bitmap */ + +static const ulint LOCK_PAGE_BITMAP_MARGIN = 64; + +/* An explicit record lock affects both the record and the gap before it. +An implicit x-lock does not affect the gap, it only locks the index +record from read or update. + +If a transaction has modified or inserted an index record, then +it owns an implicit x-lock on the record. 
On a secondary index record, +a transaction has an implicit x-lock also if it has modified the +clustered index record, the max trx id of the page where the secondary +index record resides is >= trx id of the transaction (or database recovery +is running), and there are no explicit non-gap lock requests on the +secondary index record. + +This complicated definition for a secondary index comes from the +implementation: we want to be able to determine if a secondary index +record has an implicit x-lock, just by looking at the present clustered +index record, not at the historical versions of the record. The +complicated definition can be explained to the user so that there is +nondeterminism in the access path when a query is answered: we may, +or may not, access the clustered index record and thus may, or may not, +bump into an x-lock set there. + +Different transactions can have conflicting locks set on the gap at the +same time. The locks on the gap are purely inhibitive: an insert cannot +be made, or a select cursor may have to wait if a different transaction +has a conflicting lock on the gap. An x-lock on the gap does not give +the right to insert into the gap. + +An explicit lock can be placed on a user record or the supremum record of +a page. The locks on the supremum record are always thought to be of the gap +type, though the gap bit is not set. When we perform an update of a record +where the size of the record changes, we may temporarily store its explicit +locks on the infimum record of the page, though the infimum otherwise never +carries locks. + +A waiting record lock can also be of the gap type. A waiting lock request +can be granted when there is no conflicting mode lock request by another +transaction ahead of it in the explicit lock queue. + +In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP. +It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation +level. + +------------------------------------------------------------------------- +RULE 1: If there is an implicit x-lock on a record, and there are non-gap +------- +lock requests waiting in the queue, then the transaction holding the implicit +x-lock also has an explicit non-gap record x-lock. Therefore, as locks are +released, we can grant locks to waiting lock requests purely by looking at +the explicit lock requests in the queue. + +RULE 3: Different transactions cannot have conflicting granted non-gap locks +------- +on a record at the same time. However, they can have conflicting granted gap +locks. +RULE 4: If there is a waiting lock request in a queue, no lock request, +------- +gap or not, can be inserted ahead of it in the queue. In record deletes +and page splits new gap type locks can be created by the database manager +for a transaction, and without rule 4, the waits-for graph of transactions +might become cyclic without the database noticing it, as the deadlock check +is only performed when a transaction itself requests a lock! +------------------------------------------------------------------------- + +An insert is allowed to a gap if there are no explicit lock requests by +other transactions on the next record. It does not matter if these lock +requests are granted or waiting, gap bit set or not, with the exception +that a gap type request set by another transaction to wait for +its turn to do an insert is ignored. On the other hand, an +implicit x-lock by another transaction does not prevent an insert, which +allows for more concurrency when using an Oracle-style sequence number +generator for the primary key with many transactions doing inserts +concurrently. + +A modify of a record is allowed if the transaction has an x-lock on the +record, or if other transactions do not have any non-gap lock requests on the +record.
+ +A read of a single user record with a cursor is allowed if the transaction +has a non-gap explicit, or an implicit lock on the record, or if the other +transactions have no x-lock requests on the record. At a page supremum a +read is always allowed. + +In summary, an implicit lock is seen as a granted x-lock only on the +record, not on the gap. An explicit lock with no gap bit set is a lock +both on the record and the gap. If the gap bit is set, the lock is only +on the gap. Different transactions cannot own conflicting locks on the +record at the same time, but they may own conflicting locks on the gap. +Granted locks on a record give an access right to the record, but gap type +locks just inhibit operations. + +NOTE: Finding out if some transaction has an implicit x-lock on a secondary +index record can be cumbersome. We may have to look at previous versions of +the corresponding clustered index record to find out if a delete marked +secondary index record was delete marked by an active transaction, not by +a committed one. + +FACT A: If a transaction has inserted a row, it can delete it any time +without need to wait for locks. + +PROOF: The transaction has an implicit x-lock on every index record inserted +for the row, and can thus modify each record without the need to wait. Q.E.D. + +FACT B: If a transaction has read some result set with a cursor, it can read +it again, and retrieves the same result set, if it has not modified the +result set in the meantime. Hence, there is no phantom problem. If the +biggest record, in the alphabetical order, touched by the cursor is removed, +a lock wait may occur, otherwise not. + +PROOF: When a read cursor proceeds, it sets an s-lock on each user record +it passes, and a gap type s-lock on each page supremum. The cursor must +wait until it has these locks granted. Then no other transaction can +have a granted x-lock on any of the user records, and therefore cannot +modify the user records.
Neither can any other transaction insert into +the gaps which were passed over by the cursor. Page splits and merges, +and removal of obsolete versions of records do not affect this, because +when a user record or a page supremum is removed, the next record inherits +its locks as gap type locks, and therefore blocks inserts to the same gap. +Also, if a page supremum is inserted, it inherits its locks from the successor +record. When the cursor is positioned again at the start of the result set, +the records it will touch on its course are either records it touched +during the last pass or newly inserted page supremums. It can immediately +access all these records, and when it arrives at the biggest record, it +notices that the result set is complete. If the biggest record was removed, +lock wait can occur because the next record only inherits a gap type lock, +and a wait may be needed. Q.E.D. */ + +/* If an index record should be changed or a new one inserted, we must check +the lock on the record or the next. When a read cursor starts reading, +we will set a record level s-lock on each record it passes, except on the +initial record on which the cursor is positioned before we start to fetch +records. Our index tree search has the convention that the B-tree +cursor is positioned BEFORE the first possibly matching record in +the search. Optimizations are possible here: if the record is searched +on an equality condition to a unique key, we could actually set a special +lock on the record, a lock which would not prevent any insert before +this record. In the next key locking an x-lock set on a record also +prevents inserts just before that record. + There are special infimum and supremum records on each page. +A supremum record can be locked by a read cursor. This record cannot be +updated but the lock prevents insert of a user record to the end of +the page. 
+ Next key locks will prevent the phantom problem where new rows +could appear to SELECT result sets after the select operation has been +performed. Prevention of phantoms ensures the serializability of +transactions. + What should we check if an insert of a new record is wanted? +Only the lock on the next record on the same page, because also the +supremum record can carry a lock. An s-lock prevents insertion, but +what about an x-lock? If it was set by a searched update, then there +is implicitly an s-lock, too, and the insert should be prevented. +What if our transaction owns an x-lock to the next record, but there is +a waiting s-lock request on the next record? If this s-lock was placed +by a read cursor moving in the ascending order in the index, we cannot +do the insert immediately, because when we finally commit our transaction, +the read cursor should see also the new inserted record. So we should +move the read cursor backward from the next record for it to pass over +the new inserted record. This move backward may be too cumbersome to +implement. If we in this situation just enqueue a second x-lock request +for our transaction on the next record, then the deadlock mechanism +notices a deadlock between our transaction and the s-lock request +transaction. This seems to be an ok solution. + We could have the convention that granted explicit record locks, +lock the corresponding records from changing, and also lock the gaps +before them from inserting. A waiting explicit lock request locks the gap +before from inserting. Implicit record x-locks, which we derive from the +transaction id in the clustered index record, only lock the record itself +from modification, not the gap before it from inserting. + How should we store update locks? If the search is done by a unique +key, we could just modify the record trx id. Otherwise, we could put a record +x-lock on the record. 
If the update changes ordering fields of the +clustered index record, the inserted new record needs no record lock in +lock table, the trx id is enough. The same holds for a secondary index +record. Searched delete is similar to update. + +PROBLEM: +What about waiting lock requests? If a transaction is waiting to make an +update to a record which another modified, how does the other transaction +know to send the end-lock-wait signal to the waiting transaction? If we have +the convention that a transaction may wait for just one lock at a time, how +do we preserve it if lock wait ends? + +PROBLEM: +Checking the trx id label of a secondary index record. In the case of a +modification, not an insert, is this necessary? A secondary index record +is modified only by setting or resetting its deleted flag. A secondary index +record contains fields to uniquely determine the corresponding clustered +index record. A secondary index record is therefore only modified if we +also modify the clustered index record, and the trx id checking is done +on the clustered index record, before we come to modify the secondary index +record. So, in the case of delete marking or unmarking a secondary index +record, we do not have to care about trx ids, only the locks in the lock +table must be checked. In the case of a select from a secondary index, the +trx id is relevant, and in this case we may have to search the clustered +index record. + +PROBLEM: How to update record locks when page is split or merged, or +-------------------------------------------------------------------- +a record is deleted or updated? +If the size of fields in a record changes, we perform the update by +a delete followed by an insert. How can we retain the locks set or +waiting on the record? Because a record lock is indexed in the bitmap +by the heap number of the record, when we remove the record from the +record list, it is possible still to keep the lock bits. 
If the page +is reorganized, we could make a table of old and new heap numbers, +and permute the bitmaps in the locks accordingly. We can add to the +table a row telling where the updated record ended. If the update does +not require a reorganization of the page, we can simply move the lock +bits for the updated record to the position determined by its new heap +number (we may have to allocate a new lock, if we run out of the bitmap +in the old one). + A more complicated case is the one where the reinsertion of the +updated record is done pessimistically, because the structure of the +tree may change. + +PROBLEM: If a supremum record is removed in a page merge, or a record +--------------------------------------------------------------------- +removed in a purge, what to do to the waiting lock requests? In a split to +the right, we just move the lock requests to the new supremum. If a record +is removed, we could move the waiting lock request to its inheritor, the +next record in the index. But, the next record may already have lock +requests on its own queue. A new deadlock check should be made then. Maybe +it is easier just to release the waiting transactions. They can then enqueue +new lock requests on appropriate records. + +PROBLEM: When a record is inserted, what locks should it inherit from the +------------------------------------------------------------------------- +upper neighbor? An insert of a new supremum record in a page split is +always possible, but an insert of a new user record requires that the upper +neighbor does not have any lock requests by other transactions, granted or +waiting, in its lock queue. Solution: We can copy the locks as gap type +locks, so that also the waiting locks are transformed to granted gap type +locks on the inserted record. */ + +/* LOCK COMPATIBILITY MATRIX + * IS IX S X AI + * IS + + + - + + * IX + + - - + + * S + - + - - + * X - - - - - + * AI + + - - - + * + * Note that for rows, InnoDB only acquires S or X locks. 
+ * For tables, InnoDB normally acquires IS or IX locks. + * S or X table locks are only acquired for LOCK TABLES. + * Auto-increment (AI) locks are needed because of + * statement-level MySQL binlog. + * See also lock_mode_compatible(). + */ +static const byte lock_compatibility_matrix[5][5] = { + /** Indexed as [mode1][mode2]; columns: IS IX S X AI */ + /* IS */ { TRUE, TRUE, TRUE, FALSE, TRUE}, + /* IX */ { TRUE, TRUE, FALSE, FALSE, TRUE}, + /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE}, + /* X */ { FALSE, FALSE, FALSE, FALSE, FALSE}, + /* AI */ { TRUE, TRUE, FALSE, FALSE, FALSE} +}; + +/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column) + * IS IX S X AI + * IS + - - - - + * IX + + - - - + * S + - + - - + * X + + + + + + * AI - - - - + + * See lock_mode_stronger_or_eq(). + */ +static const byte lock_strength_matrix[5][5] = { + /** Indexed as [mode1][mode2]; columns: IS IX S X AI */ + /* IS */ { TRUE, FALSE, FALSE, FALSE, FALSE}, + /* IX */ { TRUE, TRUE, FALSE, FALSE, FALSE}, + /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE}, + /* X */ { TRUE, TRUE, TRUE, TRUE, TRUE}, + /* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE} +}; + +/** Maximum depth of the DFS stack. */ +static const ulint MAX_STACK_SIZE = 4096; +/* NOTE(review): PRDT_HEAPNO is presumably the heap number under which predicate locks are recorded; confirm against the predicate lock code. */ +#define PRDT_HEAPNO PAGE_HEAP_NO_INFIMUM +/** Record locking request status */ +enum lock_rec_req_status { + /** Failed to acquire a lock */ + LOCK_REC_FAIL, + /** Succeeded in acquiring a lock (implicit or already acquired) */ + LOCK_REC_SUCCESS, + /** Explicitly created a new lock */ + LOCK_REC_SUCCESS_CREATED +}; + +#ifdef UNIV_DEBUG +/** The count of the types of locks: the number of rows of +lock_compatibility_matrix. Debug builds use it to range-check the lock +modes passed to lock_mode_compatible() and lock_mode_stronger_or_eq(). 
*/ +static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix); +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets the type of a lock. +@return LOCK_TABLE or LOCK_REC */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*********************************************************************//** +Gets the previous record lock set on a record. 
+@return previous lock on the same record, NULL if none exists */ +const lock_t* +lock_rec_get_prev( +/*==============*/ + const lock_t* in_lock,/*!< in: record lock */ + ulint heap_no);/*!< in: heap number of the record */ + +/*********************************************************************//** +Cancels a waiting lock request and releases possible other transactions +waiting behind it. */ +void +lock_cancel_waiting_and_release( +/*============================*/ + lock_t* lock); /*!< in/out: waiting lock request */ + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a clustered +index. +@return transaction id of the transaction which has the x-lock, or 0 */ +UNIV_INLINE +trx_id_t +lock_clust_rec_some_has_impl( +/*=========================*/ + const rec_t* rec, /*!< in: user record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Gets the next record lock on the same page as the given record lock. +@return next lock, NULL if none exists */ +UNIV_INLINE +const lock_t* +lock_rec_get_next_on_page_const( +/*============================*/ + const lock_t* lock); /*!< in: a record lock */ + +/*********************************************************************//** +Gets the nth bit of a record lock. +@return TRUE if the bit is set; FALSE if the bit is clear or if i is not +less than the number of bits in the bitmap (for example when +i == ULINT_UNDEFINED) */ +UNIV_INLINE +ibool +lock_rec_get_nth_bit( +/*=================*/ + const lock_t* lock, /*!< in: record lock */ + ulint i); /*!< in: index of the bit */ + +/*********************************************************************//** +Gets the number of bits in a record lock bitmap. 
+@return number of bits */ +UNIV_INLINE +ulint +lock_rec_get_n_bits( +/*================*/ + const lock_t* lock); /*!< in: record lock */ + +/**********************************************************************//** +Sets the nth bit of a record lock to TRUE and increments the n_rec_locks +counter of the transaction that owns the lock. */ +UNIV_INLINE +void +lock_rec_set_nth_bit( +/*=================*/ + lock_t* lock, /*!< in/out: record lock */ + ulint i); /*!< in: index of the bit; debug-asserted to be less than the number of bits in the bitmap */ + +/** Reset the nth bit of a record lock. +If the bit was set, the n_rec_locks counter of the transaction that owns +the lock is decremented. +@param[in,out] lock record lock +@param[in] i index of the bit that will be reset; debug-asserted to be +less than the number of bits in the bitmap +@return previous state of the bit: 0 if it was clear, otherwise the +nonzero bit mask 1 << (i & 7), which is not necessarily 1 */ +inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i) +{ + ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(i < lock->un_member.rec_lock.n_bits); + /* The bitmap starts right after the lock struct: byte i / 8, bit i % 8. */ + byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3); + byte mask = byte(1U << (i & 7)); + byte bit = *b & mask; + *b &= byte(~mask); + /* Only a bit that was actually set counted towards n_rec_locks. */ + if (bit != 0) { + ut_ad(lock->trx->lock.n_rec_locks > 0); + --lock->trx->lock.n_rec_locks; + } + + return(bit); +} + +/*********************************************************************//** +Gets the next record lock on the same page as the given record lock. +@return next lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_next_on_page( +/*======================*/ + lock_t* lock); /*!< in: a record lock */ + +/*********************************************************************//** +Gets the next explicit lock request on a record. 
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */ +UNIV_INLINE +lock_t* +lock_rec_get_next( +/*==============*/ + ulint heap_no,/*!< in: heap number of the record */ + lock_t* lock); /*!< in: lock */ + +/*********************************************************************//** +Gets the next explicit lock request on a record. 
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */ +UNIV_INLINE +const lock_t* +lock_rec_get_next_const( +/*====================*/ + ulint heap_no,/*!< in: heap number of the record */ + const lock_t* lock); /*!< in: lock */ + +/*********************************************************************//** +Gets the first explicit lock request on a record. +@return first lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_first( +/*===============*/ + hash_table_t* hash, /*!< in: hash table whose chain the lock is on */ + const buf_block_t* block, /*!< in: block containing the record */ + ulint heap_no);/*!< in: heap number of the record */ + +/*********************************************************************//** +Gets the mode of a lock. +@return the basic lock mode, i.e. type_mode masked with LOCK_MODE_MASK */ +UNIV_INLINE +enum lock_mode +lock_get_mode( +/*==========*/ + const lock_t* lock); /*!< in: lock */ + +/*********************************************************************//** +Calculates if lock mode 1 is compatible with lock mode 2. +@return nonzero if mode1 compatible with mode2, as given by +lock_compatibility_matrix[mode1][mode2] */ +UNIV_INLINE +ulint +lock_mode_compatible( +/*=================*/ + enum lock_mode mode1, /*!< in: lock mode (matrix row) */ + enum lock_mode mode2); /*!< in: lock mode (matrix column) */ + +/*********************************************************************//** +Calculates if lock mode 1 is stronger or equal to lock mode 2. 
+@return nonzero if mode1 stronger or equal to mode2, as given by +lock_strength_matrix[mode1][mode2] */ +UNIV_INLINE +ulint +lock_mode_stronger_or_eq( +/*=====================*/ + enum lock_mode mode1, /*!< in: lock mode (matrix row) */ + enum lock_mode mode2); /*!< in: lock mode (matrix column) */ + +/*********************************************************************//** +Gets the wait flag of a lock. +@return LOCK_WAIT if waiting, 0 if not */ +UNIV_INLINE +ulint +lock_get_wait( +/*==========*/ + const lock_t* lock); /*!< in: lock */ + +/*********************************************************************//** +Checks if a transaction has the specified table lock, or stronger. 
This +function should only be called by the thread that owns the transaction. +@return lock or NULL */ +UNIV_INLINE +const lock_t* +lock_table_has( +/*===========*/ + const trx_t* trx, /*!< in: transaction */ + const dict_table_t* table, /*!< in: table */ + enum lock_mode mode); /*!< in: lock mode */ + +/** Set the wait status of a lock. +Points trx->lock.wait_lock at the lock and sets the LOCK_WAIT flag in +lock->type_mode. Debug builds assert that the lock belongs to trx, that +trx is not already waiting for a lock, and that the caller holds both the +lock mutex and the trx mutex. +@param[in,out] lock lock that will be waited for +@param[in,out] trx transaction that will wait for the lock */ +inline void lock_set_lock_and_trx_wait(lock_t* lock, trx_t* trx) +{ + ut_ad(lock); + ut_ad(lock->trx == trx); + ut_ad(trx->lock.wait_lock == NULL); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(trx)); + /* Record the wait on both sides: in the transaction and in the lock. */ + trx->lock.wait_lock = lock; + lock->type_mode |= LOCK_WAIT; +} + +/** Reset the wait status of a lock. +Clears the wait_lock pointer of the transaction that owns the lock and +clears the LOCK_WAIT flag in lock->type_mode. Debug builds assert that the +flag was set, that the caller holds the lock mutex, and that the owning +transaction was waiting either for this lock or for no lock at all. 
+@param[in,out] lock lock that was possibly being waited for */ +inline void lock_reset_lock_and_trx_wait(lock_t* lock) +{ + ut_ad(lock_get_wait(lock)); + ut_ad(lock_mutex_own()); + ut_ad(lock->trx->lock.wait_lock == NULL + || lock->trx->lock.wait_lock == lock); + lock->trx->lock.wait_lock = NULL; + lock->type_mode &= ~LOCK_WAIT; +} + +#include "lock0priv.ic" + +#endif /* lock0priv_h */ diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic new file mode 100644 index 00000000..e16949a4 --- /dev/null +++ b/storage/innobase/include/lock0priv.ic @@ -0,0 +1,321 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0priv.ic +Lock module internal inline methods. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +/* This file contains only methods which are used in +lock/lock0* files, other than lock/lock0lock.cc. +I.e. lock/lock0lock.cc contains more internal inline +methods but they are used only in that file. */ + +#ifndef LOCK_MODULE_IMPLEMENTATION +#error Do not include lock0priv.ic outside of the lock/ module +#endif + +#include "row0row.h" + +/*********************************************************************//** +Gets the type of a lock. +@return LOCK_TABLE or LOCK_REC, i.e. type_mode masked with LOCK_TYPE_MASK */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + const lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_TYPE_MASK); +} + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a clustered +index. This function only returns what row_get_rec_trx_id() reports for +the record; it does not itself check whether that transaction is still +active. 
+@return transaction id of the transaction which has the x-lock, or 0 */ +UNIV_INLINE +trx_id_t +lock_clust_rec_some_has_impl( +/*=========================*/ + const rec_t* rec, /*!< in: user record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(page_rec_is_user_rec(rec)); + + return(row_get_rec_trx_id(rec, index, offsets)); +} + +/*********************************************************************//** +Gets the number of bits in a record lock bitmap. 
+@return number of bits */ +UNIV_INLINE +ulint +lock_rec_get_n_bits( +/*================*/ + const lock_t* lock) /*!< in: record lock */ +{ + return(lock->un_member.rec_lock.n_bits); +} + +/**********************************************************************//** +Sets the nth bit of a record lock to TRUE and increments the n_rec_locks +counter of the transaction that owns the lock. The counter is incremented +unconditionally, also when the bit was already set. The bitmap is stored +immediately after the lock struct. */ +UNIV_INLINE +void +lock_rec_set_nth_bit( +/*=================*/ + lock_t* lock, /*!< in/out: record lock */ + ulint i) /*!< in: index of the bit; debug-asserted to be less than the number of bits in the bitmap */ +{ + ulint byte_index; + ulint bit_index; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(i < lock->un_member.rec_lock.n_bits); + + byte_index = i / 8; + bit_index = i % 8; + +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */ +#endif + ((byte*) &lock[1])[byte_index] |= static_cast<byte>(1 << bit_index); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + ++lock->trx->lock.n_rec_locks; +} + +/*********************************************************************//** +Gets the next record lock on the same page as the given record lock. +Non-const wrapper around lock_rec_get_next_on_page_const(). +@return next lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_next_on_page( +/*======================*/ + lock_t* lock) /*!< in: a record lock */ +{ + return((lock_t*) lock_rec_get_next_on_page_const(lock)); +} + +/*********************************************************************//** +Gets the next explicit lock request on a record. 
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */ +UNIV_INLINE +lock_t* +lock_rec_get_next( +/*==============*/ + ulint heap_no,/*!< in: heap number of the record */ + lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock_mutex_own()); + /* Step through the following record locks on the same page until one has the bit for heap_no set, or until the locks run out. */ + do { + ut_ad(lock_get_type_low(lock) == LOCK_REC); + lock = lock_rec_get_next_on_page(lock); + } while (lock && !lock_rec_get_nth_bit(lock, heap_no)); + + return(lock); +} + +/*********************************************************************//** +Gets the next explicit lock request on a record. +Const-qualified wrapper around lock_rec_get_next(). +@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */ +UNIV_INLINE +const lock_t* +lock_rec_get_next_const( +/*====================*/ + ulint heap_no,/*!< in: heap number of the record */ + const lock_t* lock) /*!< in: lock */ +{ + return(lock_rec_get_next(heap_no, (lock_t*) lock)); +} + +/*********************************************************************//** +Gets the first explicit lock request on a record. +Starts from the first lock that lock_sys.get_first() returns for the page +of the block and walks the record locks on that page until one has the bit +for heap_no set. 
+@return first lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_first( +/*===============*/ + hash_table_t* hash, /*!< in: hash table whose chain the lock is on */ + const buf_block_t* block, /*!< in: block containing the record */ + ulint heap_no)/*!< in: heap number of the record */ +{ + for (lock_t *lock= lock_sys.get_first(*hash, block->page.id()); + lock; lock= lock_rec_get_next_on_page(lock)) + if (lock_rec_get_nth_bit(lock, heap_no)) + return lock; + return nullptr; +} + +/*********************************************************************//** +Gets the nth bit of a record lock. 
+@return TRUE if bit set also if i == ULINT_UNDEFINED return FALSE*/ +UNIV_INLINE +ibool +lock_rec_get_nth_bit( +/*=================*/ + const lock_t* lock, /*!< in: record lock */ + ulint i) /*!< in: index of the bit */ +{ + const byte* b; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + if (i >= lock->un_member.rec_lock.n_bits) { + /* An index beyond the bitmap is reported as a clear bit rather than asserted on. */ + return(FALSE); + } + /* The bitmap is stored immediately after the lock struct. */ + b = ((const byte*) &lock[1]) + (i / 8); + /* '>>' binds tighter than '&': this yields bit (i % 8) of *b as 0 or 1. */ + return(1 & *b >> (i % 8)); +} + +/*********************************************************************//** +Gets the next record lock on the same page as the given record lock, by +following the hash chain until a lock with the same page id is found. +@return next lock, NULL if none exists */ +UNIV_INLINE +const lock_t* +lock_rec_get_next_on_page_const( +/*============================*/ + const lock_t* lock) /*!< in: a record lock */ +{ + ut_ad(lock_mutex_own()); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + /* Remember the page before the loop overwrites the lock pointer. */ + const page_id_t page_id(lock->un_member.rec_lock.page_id); + /* Locks of other pages may share the hash chain; skip them. */ + while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock)))) + if (lock->un_member.rec_lock.page_id == page_id) + break; + return lock; +} + +/*********************************************************************//** +Gets the mode of a lock. +@return the basic lock mode, i.e. 
type_mode masked with LOCK_MODE_MASK */ +UNIV_INLINE +enum lock_mode +lock_get_mode( +/*==========*/ + const lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock); + + return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK)); +} + +/*********************************************************************//** +Calculates if lock mode 1 is compatible with lock mode 2. +@return nonzero if mode1 compatible with mode2, as given by +lock_compatibility_matrix[mode1][mode2] */ +UNIV_INLINE +ulint +lock_mode_compatible( +/*=================*/ + enum lock_mode mode1, /*!< in: lock mode (matrix row) */ + enum lock_mode mode2) /*!< in: lock mode (matrix column) */ +{ + ut_ad((ulint) mode1 < lock_types); + ut_ad((ulint) mode2 < lock_types); + + return(lock_compatibility_matrix[mode1][mode2]); +} + +/*********************************************************************//** +Calculates if lock mode 1 is stronger or equal to lock mode 2. 
+@return nonzero if mode1 stronger or equal to mode2 */ +UNIV_INLINE +ulint +lock_mode_stronger_or_eq( +/*=====================*/ + enum lock_mode mode1, /*!< in: lock mode (matrix row) */ + enum lock_mode mode2) /*!< in: lock mode (matrix column) */ +{ + ut_ad((ulint) mode1 < lock_types); + ut_ad((ulint) mode2 < lock_types); + + return(lock_strength_matrix[mode1][mode2]); +} + +/*********************************************************************//** +Gets the wait flag of a lock. +@return LOCK_WAIT if waiting, 0 if not */ +UNIV_INLINE +ulint +lock_get_wait( +/*==========*/ + const lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_WAIT); +} + +/*********************************************************************//** +Checks if a transaction has the specified table lock, or stronger. This +function should only be called by the thread that owns the transaction. +@return the first lock in trx->lock.table_locks that is on the given table +and whose mode is stronger than or equal to in_mode, or NULL if there is +none */ +UNIV_INLINE +const lock_t* +lock_table_has( +/*===========*/ + const trx_t* trx, /*!< in: transaction */ + const dict_table_t* table, /*!< in: table */ + lock_mode in_mode)/*!< in: lock mode */ +{ + /* Look for an equal or stronger lock the same trx already has on the table */ + + for (lock_list::const_iterator it = trx->lock.table_locks.begin(), + end = trx->lock.table_locks.end(); it != end; ++it) { + + const lock_t* lock = *it; + /* The list may contain NULL entries; skip them. */ + if (lock == NULL) { + continue; + } + + lock_mode mode = lock_get_mode(lock); + + ut_ad(trx == lock->trx); + ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + ut_ad(lock->un_member.tab_lock.table != NULL); + + if (table == lock->un_member.tab_lock.table + && lock_mode_stronger_or_eq(mode, in_mode)) { + /* A lock found in this list is expected to be granted, not waiting. 
*/ + ut_ad(!lock_get_wait(lock)); + + return(lock); + } + } + + return(NULL); +} + +/* vim: set filetype=c: */ diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h new file mode 100644 index 00000000..23307375 --- /dev/null +++ b/storage/innobase/include/lock0types.h @@ -0,0 +1,273 @@ 
+/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0types.h +The transaction lock system global types + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0types.h" +#include "buf0types.h" +#include "ut0lst.h" + +#ifndef lock0types_h +#define lock0types_h + +#define lock_t ib_lock_t + +struct lock_t; +struct lock_table_t; + +/* Basic lock modes */ +enum lock_mode { + LOCK_IS = 0, /* intention shared */ + LOCK_IX, /* intention exclusive */ + LOCK_S, /* shared */ + LOCK_X, /* exclusive */ + LOCK_AUTO_INC, /* locks the auto-inc counter of a table + in an exclusive mode */ + LOCK_NONE, /* this is used elsewhere to note consistent read */ + LOCK_NUM = LOCK_NONE, /* number of lock modes */ + LOCK_NONE_UNSET = 255 +}; + +/** Convert the given enum value into string. 
+@param[in] mode the lock mode +@return human readable string of the given enum value */ +inline +const char* lock_mode_string(enum lock_mode mode) +{ + switch (mode) { + case LOCK_IS: + return("LOCK_IS"); + case LOCK_IX: + return("LOCK_IX"); + case LOCK_S: + return("LOCK_S"); + case LOCK_X: + return("LOCK_X"); + case LOCK_AUTO_INC: + return("LOCK_AUTO_INC"); + case LOCK_NONE: + return("LOCK_NONE"); + case LOCK_NONE_UNSET: + return("LOCK_NONE_UNSET"); + default: + ut_error; + } +} + +/** A table lock */ +struct lock_table_t { + dict_table_t* table; /*!< database table in dictionary + cache */ + UT_LIST_NODE_T(ib_lock_t) + locks; /*!< list of locks on the same + table */ + /** Print the table lock into the given output stream + @param[in,out] out the output stream + @return the given output stream. */ + std::ostream& print(std::ostream& out) const; +}; + +/** Record lock for a page */ +struct lock_rec_t { + /** page identifier */ + page_id_t page_id; + ib_uint32_t n_bits; /*!< number of bits in the lock + bitmap; NOTE: the lock bitmap is + placed immediately after the + lock struct */ + + /** Print the record lock into the given output stream + @param[in,out] out the output stream + @return the given output stream. */ + std::ostream& print(std::ostream& out) const; +}; + +/** Print the record lock into the given output stream +@param[in,out] out the output stream +@return the given output stream. 
*/ +inline std::ostream &lock_rec_t::print(std::ostream &out) const +{ + out << "[lock_rec_t: space=" << page_id.space() + << ", page_no=" << page_id.page_no() + << ", n_bits=" << n_bits << "]"; + return out; +} + +inline +std::ostream& +operator<<(std::ostream& out, const lock_rec_t& lock) +{ + return(lock.print(out)); +} + +#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the + type_mode field in a lock */ +/** Lock types */ +/* @{ */ +#define LOCK_TABLE 16U /*!< table lock */ +#define LOCK_REC 32U /*!< record lock */ +#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the + type_mode field in a lock */ +#if LOCK_MODE_MASK & LOCK_TYPE_MASK +# error "LOCK_MODE_MASK & LOCK_TYPE_MASK" +#endif + +#define LOCK_WAIT 256U /*!< Waiting lock flag; when set, it + means that the lock has not yet been + granted, it is just waiting for its + turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary + next-key lock in contrast to LOCK_GAP + or LOCK_REC_NOT_GAP */ +#define LOCK_GAP 512U /*!< when this bit is set, it means that the + lock holds only on the gap before the record; + for instance, an x-lock on the gap does not + give permission to modify the record on which + the bit is set; locks of this type are created + when records are removed from the index chain + of records */ +#define LOCK_REC_NOT_GAP 1024U /*!< this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048U/*!< this bit is set when we place a waiting + gap type record lock request in order to let + an insert of an index record to wait until + there are no conflicting locks by other + transactions on the gap; note that 
this flag + remains set when the waiting lock is granted, + or if the lock is inherited to a neighboring + record */ +#define LOCK_PREDICATE 8192U /*!< Predicate lock */ +#define LOCK_PRDT_PAGE 16384U /*!< Page lock */ + + +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_MODE_MASK +# error +#endif +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_TYPE_MASK +# error +#endif +/* @} */ + +/** Lock struct; protected by lock_sys.mutex */ +struct ib_lock_t +{ + trx_t* trx; /*!< transaction owning the + lock */ + UT_LIST_NODE_T(ib_lock_t) + trx_locks; /*!< list of the locks of the + transaction */ + + dict_index_t* index; /*!< index for a record lock */ + + ib_lock_t* hash; /*!< hash chain node for a record + lock. The link node in a singly linked + list, used during hashing. */ + + /** time(NULL) of the lock request creation. + Used for computing wait_time and diagnostics only. + Note: bogus durations may be reported + when the system time is adjusted! */ + time_t requested_time; + /** Cumulated wait time in seconds. + Note: may be bogus when the system time is adjusted! */ + ulint wait_time; + + union { + lock_table_t tab_lock;/*!< table lock */ + lock_rec_t rec_lock;/*!< record lock */ + } un_member; /*!< lock details */ + + ib_uint32_t type_mode; /*!< lock type, mode, LOCK_GAP or + LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION, + wait flag, ORed */ + + /** Determine if the lock object is a record lock. + @return true if record lock, false otherwise. 
*/ + bool is_record_lock() const + { + return(type() == LOCK_REC); + } + + bool is_waiting() const + { + return(type_mode & LOCK_WAIT); + } + + bool is_gap() const + { + return(type_mode & LOCK_GAP); + } + + bool is_record_not_gap() const + { + return(type_mode & LOCK_REC_NOT_GAP); + } + + bool is_insert_intention() const + { + return(type_mode & LOCK_INSERT_INTENTION); + } + + ulint type() const { + return(type_mode & LOCK_TYPE_MASK); + } + + enum lock_mode mode() const + { + return(static_cast<enum lock_mode>(type_mode & LOCK_MODE_MASK)); + } + + /** Print the lock object into the given output stream. + @param[in,out] out the output stream + @return the given output stream. */ + std::ostream& print(std::ostream& out) const; + + /** Convert the member 'type_mode' into a human readable string. + @return human readable string */ + std::string type_mode_string() const; + + const char* type_string() const + { + switch (type_mode & LOCK_TYPE_MASK) { + case LOCK_REC: + return("LOCK_REC"); + case LOCK_TABLE: + return("LOCK_TABLE"); + default: + ut_error; + } + } +}; + +typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t; + +#endif /* lock0types_h */ diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h new file mode 100644 index 00000000..980a79d8 --- /dev/null +++ b/storage/innobase/include/log0crypt.h @@ -0,0 +1,125 @@ +/***************************************************************************** + +Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. +Copyright (C) 2014, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/**************************************************//** +@file include/log0crypt.h +Innodb log encrypt/decrypt + +Created 11/25/2013 Minli Zhu +Modified Jan Lindström jan.lindstrom@mariadb.com +MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation. +*******************************************************/ +#ifndef log0crypt_h +#define log0crypt_h + +#include "log0log.h" + +/** innodb_encrypt_log: whether to encrypt the redo log */ +extern my_bool srv_encrypt_log; + +/** Initialize the redo log encryption key and random parameters +when creating a new redo log. +The random parameters will be persisted in the log checkpoint pages. +@see log_crypt_write_checkpoint_buf() +@see log_crypt_read_checkpoint_buf() +@return whether the operation succeeded */ +UNIV_INTERN +bool +log_crypt_init(); + +/*********************************************************************//** +Writes the crypto (version, msg and iv) info, which has been used for +log blocks with lsn <= this checkpoint's lsn, to a log header's +checkpoint buf. */ +UNIV_INTERN +void +log_crypt_write_checkpoint_buf( +/*===========================*/ + byte* buf); /*!< in/out: checkpoint buffer */ + +/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info. +@param[in] buf checkpoint buffer +@return whether the operation was successful */ +ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf); + +/** Decrypt a MariaDB 10.1 redo log block. 
+@param[in,out] buf log block +@param[in] start_lsn server start LSN +@return whether the decryption was successful */ +ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn); + +/** Read the checkpoint crypto (version, msg and iv) info. +@param[in] buf checkpoint buffer +@return whether the operation was successful */ +bool log_crypt_read_checkpoint_buf(const byte* buf); + +/** log_crypt() operation code */ +enum log_crypt_t { + /** encrypt a log block without rotating key */ + LOG_ENCRYPT, + /** decrypt a log block */ + LOG_DECRYPT, + /** attempt to rotate the key, and encrypt a log block */ + LOG_ENCRYPT_ROTATE_KEY +}; + +/** Encrypt or decrypt log blocks. +@param[in,out] buf log blocks to encrypt or decrypt +@param[in] lsn log sequence number of the start of the buffer +@param[in] size size of the buffer, in bytes +@param[in] op whether to decrypt, encrypt, or rotate key and encrypt +@return whether the operation succeeded (encrypt always does) */ +bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT); + +/** Encrypt or decrypt a temporary file block. +@param[in] src block to encrypt or decrypt +@param[in] size size of the block +@param[out] dst destination block +@param[in] offs offset to block +@param[in] encrypt true=encrypt; false=decrypt +@return whether the operation succeeded */ +UNIV_INTERN +bool +log_tmp_block_encrypt( + const byte* src, + ulint size, + byte* dst, + uint64_t offs, + bool encrypt = true) + MY_ATTRIBUTE((warn_unused_result, nonnull)); + +/** Decrypt a temporary file block. 
+@param[in] src block to decrypt +@param[in] size size of the block +@param[out] dst destination block +@param[in] offs offset to block +@return whether the operation succeeded */ +inline +bool +log_tmp_block_decrypt( + const byte* src, + ulint size, + byte* dst, + uint64_t offs) +{ + return(log_tmp_block_encrypt(src, size, dst, offs, false)); +} + +/** @return whether temporary files are encrypted */ +inline bool log_tmp_is_encrypted() { return srv_encrypt_log; } +#endif // log0crypt.h diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h new file mode 100644 index 00000000..460acaf5 --- /dev/null +++ b/storage/innobase/include/log0log.h @@ -0,0 +1,751 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2009, Google Inc. +Copyright (c) 2017, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0log.h +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#ifndef log0log_h +#define log0log_h + +#include "log0types.h" +#include "os0file.h" +#include "span.h" +#include "my_atomic_wrapper.h" +#include <vector> +#include <string> + +using st_::span; + +static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile"; +static const char LOG_FILE_NAME[] = "ib_logfile0"; + +/** Composes full path for a redo log file +@param[in] filename name of the redo log file +@return path with log file name*/ +std::string get_log_file_path(const char *filename= LOG_FILE_NAME); + +/** Returns paths for all existing log files */ +std::vector<std::string> get_existing_log_files_paths(); + +/** Delete log file. +@param[in] suffix suffix of the file name */ +static inline void delete_log_file(const char* suffix) +{ + auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix); + os_file_delete_if_exists(innodb_log_file_key, path.c_str(), nullptr); +} + +/** Append a string to the log. +@param[in] str string +@param[in] len string length +@param[out] start_lsn start LSN of the log record +@return end lsn of the log record, zero if did not succeed */ +UNIV_INLINE +lsn_t +log_reserve_and_write_fast( + const void* str, + ulint len, + lsn_t* start_lsn); +/***********************************************************************//** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. 
NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void); +/*================*/ + +/** Extends the log buffer. +@param[in] len requested minimum size in bytes */ +void log_buffer_extend(ulong len); + +/** Calculate the recommended highest values for lsn - last_checkpoint_lsn +and lsn - buf_pool.get_oldest_modification(). +@param[in] file_size requested innodb_log_file_size +@retval true on success +@retval false if the smallest log is too small to +accommodate the number of OS threads in the database server */ +bool +log_set_capacity(ulonglong file_size) + MY_ATTRIBUTE((warn_unused_result)); + +/** Ensure that the log has been written to the log file up to a given +log entry (such as that of a transaction commit). Start a new write, or +wait and check if an already running write is covering the request. +@param[in] lsn log sequence number that should be +included in the redo log file write +@param[in] flush_to_disk whether the written log should also +be flushed to the file system +@param[in] rotate_key whether to rotate the encryption key */ +void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false); + +/** write to the log file up to the last log entry. +@param[in] sync whether we want the written log +also to be flushed to disk. */ +void +log_buffer_flush_to_disk( + bool sync = true); + +/** Make a checkpoint */ +ATTRIBUTE_COLD void log_make_checkpoint(); + +/** Make a checkpoint at the latest lsn on shutdown. */ +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown(); + +/** Write checkpoint info to the log header and release log_sys.mutex. +@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ +ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn); + +/** +Checks that there is enough free space in the log to start a new query step. +Flushes the log buffer or makes a new checkpoint if necessary. 
NOTE: this +function may only be called if the calling thread owns no synchronization +objects! */ +ATTRIBUTE_COLD void log_check_margins(); + +/************************************************************//** +Gets a log block flush bit. +@return TRUE if this block was the first to be written in a log flush */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Gets a log block number stored in the header. +@return log block number stored in the block header */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Gets a log block data length. +@return log block data length measured as a byte offset from the block start */ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint len); /*!< in: data length */ +/** Calculate the CRC-32C checksum of a log block. +@param[in] block log block +@return checksum */ +inline ulint log_block_calc_checksum_crc32(const byte* block); + +/************************************************************//** +Gets a log block checksum field value. +@return checksum */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets a log block checksum field value. 
*/ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint checksum); /*!< in: checksum */ +/************************************************************//** +Gets a log block first mtr log record group offset. +@return first mtr log record group byte offset from the block start, 0 +if none */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /*!< in/out: log block */ + ulint offset); /*!< in: offset, 0 if none */ +/************************************************************//** +Gets a log block checkpoint number field (4 lowest bytes). +@return checkpoint no (4 lowest bytes) */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Initializes a log block in the log buffer. */ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn); /*!< in: lsn within the log block */ +/************************************************************//** +Converts a lsn to a log block number. +@return log block number, it is > 0 and <= 1G */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + lsn_t lsn); /*!< in: lsn of a byte within the block */ +/******************************************************//** +Prints info of the log. */ +void +log_print( +/*======*/ + FILE* file); /*!< in: file where to print */ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. 
*/ +void +log_refresh_stats(void); +/*===================*/ + +/* The counting of lsn's starts from this value: this must be non-zero */ +#define LOG_START_LSN ((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE)) + +/* Offsets of a log block header */ +#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and + is allowed to wrap around at 2G; the + highest bit is set to 1 if this is the + first log block in a log flush write + segment */ +#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL + /* mask used to get the highest bit in + the preceding field */ +#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to + this block */ +#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an + mtr log record group in this log block, + 0 if none; if the value is the same + as LOG_BLOCK_HDR_DATA_LEN, it means + that the first rec group has not yet + been catenated to this log block, but + if it will, it will start at this + offset; an archive recovery can + start parsing the log records starting + from this offset in this log block, + if value not 0 */ +#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of + log_sys.next_checkpoint_no when the + log block was last written to: if the + block has not yet been written full, + this value is only updated before a + log buffer flush */ +#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in + bytes */ + +#define LOG_BLOCK_KEY 4 /* encryption key version + before LOG_BLOCK_CHECKSUM; + after log_t::FORMAT_ENC_10_4 only */ +#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block + contents; in InnoDB versions + < 3.23.52 this did not contain the + checksum but the same value as + LOG_BLOCK_HDR_NO */ + +/** Offsets inside the checkpoint pages (redo log format version 1) @{ */ +/** Checkpoint number */ +#define LOG_CHECKPOINT_NO 0 +/** Log sequence number up to which all changes have been flushed */ +#define LOG_CHECKPOINT_LSN 8 +/** Byte offset of the log record corresponding to 
LOG_CHECKPOINT_LSN */ +#define LOG_CHECKPOINT_OFFSET 16 +/** srv_log_buffer_size at the time of the checkpoint (not used) */ +#define LOG_CHECKPOINT_LOG_BUF_SIZE 24 +/** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/ +#define LOG_CHECKPOINT_CRYPT_KEY 32 +/** MariaDB 10.2.5 encrypted redo log random nonce (32 bits) */ +#define LOG_CHECKPOINT_CRYPT_NONCE 36 +/** MariaDB 10.2.5 encrypted redo log random message (MY_AES_BLOCK_SIZE) */ +#define LOG_CHECKPOINT_CRYPT_MESSAGE 40 +/** start LSN of the MLOG_CHECKPOINT mini-transaction corresponding +to this checkpoint, or 0 if the information has not been written */ +#define LOG_CHECKPOINT_END_LSN (OS_FILE_LOG_BLOCK_SIZE - 16) + +/* @} */ + +/** Offsets of a log file header */ +/* @{ */ +/** Log file header format identifier (32-bit unsigned big-endian integer). +This used to be called LOG_GROUP_ID and always written as 0, +because InnoDB never supported more than one copy of the redo log. */ +#define LOG_HEADER_FORMAT 0 +/** Redo log subformat (originally 0). In format version 0, the +LOG_FILE_START_LSN started here, 4 bytes earlier than LOG_HEADER_START_LSN, +which the LOG_FILE_START_LSN was renamed to. +Subformat 1 is for the fully redo-logged TRUNCATE +(no MLOG_TRUNCATE records or extra log checkpoints or log file) */ +#define LOG_HEADER_SUBFORMAT 4 +/** LSN of the start of data in this log file (with format version 1; +in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */ +#define LOG_HEADER_START_LSN 8 +/** A null-terminated string which will contain either the string 'ibbackup' +and the creation time if the log file was created by mysqlbackup --restore, +or the MySQL version that created the redo log file. */ +#define LOG_HEADER_CREATOR 16 +/** End of the log file creator field. */ +#define LOG_HEADER_CREATOR_END (LOG_HEADER_CREATOR + 32) +/** Contents of the LOG_HEADER_CREATOR field */ +#define LOG_HEADER_CREATOR_CURRENT \ + "MariaDB " \ + IB_TO_STR(MYSQL_VERSION_MAJOR) "." 
\ + IB_TO_STR(MYSQL_VERSION_MINOR) "." \ + IB_TO_STR(MYSQL_VERSION_PATCH) + +/* @} */ + +#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE + /* first checkpoint field in the log + header; we write alternately to the + checkpoint fields when we make new + checkpoints; this field is only defined + in the first log file of a log */ +#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE) + /* second checkpoint field in the log + header */ +#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE) + +/** Memory mapped file */ +class mapped_file_t +{ +public: + mapped_file_t()= default; + mapped_file_t(const mapped_file_t &)= delete; + mapped_file_t &operator=(const mapped_file_t &)= delete; + mapped_file_t(mapped_file_t &&)= delete; + mapped_file_t &operator=(mapped_file_t &&)= delete; + ~mapped_file_t() noexcept; + + dberr_t map(const char *path, bool read_only= false, + bool nvme= false) noexcept; + dberr_t unmap() noexcept; + byte *data() noexcept { return m_area.data(); } + +private: + span<byte> m_area; +}; + +/** Abstraction for reading, writing and flushing file cache to disk */ +class file_io +{ +public: + file_io(bool durable_writes= false) : m_durable_writes(durable_writes) {} + virtual ~file_io() noexcept {}; + virtual dberr_t open(const char *path, bool read_only) noexcept= 0; + virtual dberr_t rename(const char *old_path, + const char *new_path) noexcept= 0; + virtual dberr_t close() noexcept= 0; + virtual dberr_t read(os_offset_t offset, span<byte> buf) noexcept= 0; + virtual dberr_t write(const char *path, os_offset_t offset, + span<const byte> buf) noexcept= 0; + virtual dberr_t flush() noexcept= 0; + + /** @return whether writes are durable; durable writes do not require calling flush() */ + bool writes_are_durable() const noexcept { return m_durable_writes; } + +protected: + bool m_durable_writes; +}; + +class file_os_io final: public file_io +{ +public: + file_os_io()= default; + file_os_io(const file_os_io &)= delete; + file_os_io &operator=(const file_os_io &)= delete; + 
file_os_io(file_os_io &&rhs); + file_os_io &operator=(file_os_io &&rhs); + ~file_os_io() noexcept; + + dberr_t open(const char *path, bool read_only) noexcept final; + bool is_opened() const noexcept { return m_fd != OS_FILE_CLOSED; } + dberr_t rename(const char *old_path, const char *new_path) noexcept final; + dberr_t close() noexcept final; + dberr_t read(os_offset_t offset, span<byte> buf) noexcept final; + dberr_t write(const char *path, os_offset_t offset, + span<const byte> buf) noexcept final; + dberr_t flush() noexcept final; + +private: + pfs_os_file_t m_fd{OS_FILE_CLOSED}; +}; + +/** File abstraction + path */ +class log_file_t +{ +public: + log_file_t(std::string path= "") noexcept : m_path{std::move(path)} {} + + dberr_t open(bool read_only) noexcept; + bool is_opened() const noexcept; + + const std::string &get_path() const noexcept { return m_path; } + + dberr_t rename(std::string new_path) noexcept; + dberr_t close() noexcept; + dberr_t read(os_offset_t offset, span<byte> buf) noexcept; + bool writes_are_durable() const noexcept; + dberr_t write(os_offset_t offset, span<const byte> buf) noexcept; + dberr_t flush() noexcept; + void free() + { + m_path.clear(); + m_path.shrink_to_fit(); + } + +private: + std::unique_ptr<file_io> m_file; + std::string m_path; +}; + +/** Redo log buffer */ +struct log_t{ + /** The original (not version-tagged) InnoDB redo log format */ + static constexpr uint32_t FORMAT_3_23 = 0; + /** The MySQL 5.7.9/MariaDB 10.2.2 log format */ + static constexpr uint32_t FORMAT_10_2 = 1; + /** The MariaDB 10.3.2 log format. + To prevent crash-downgrade to earlier 10.2 due to the inability to + roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, + MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT + 1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 + (MDEV-13564 backup-friendly TRUNCATE). 
*/ + static constexpr uint32_t FORMAT_10_3 = 103; + /** The MariaDB 10.4.0 log format. */ + static constexpr uint32_t FORMAT_10_4 = 104; + /** Encrypted MariaDB redo log */ + static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31; + /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */ + static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED; + /** The MariaDB 10.5 physical redo log format */ + static constexpr uint32_t FORMAT_10_5 = 0x50485953; + /** The MariaDB 10.5 physical format (only with innodb_encrypt_log=ON) */ + static constexpr uint32_t FORMAT_ENC_10_5 = FORMAT_10_5 | FORMAT_ENCRYPTED; + +private: + /** The log sequence number of the last change of durable InnoDB files */ + MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) + std::atomic<lsn_t> lsn; + /** the first guaranteed-durable log sequence number */ + std::atomic<lsn_t> flushed_to_disk_lsn; + /** set when there may be need to flush the log buffer, or + preflush buffer pool pages, or initiate a log checkpoint. + This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */ + std::atomic<bool> check_flush_or_checkpoint_; +public: + /** mutex protecting the log */ + MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; + /** first free offset within the log buffer in use */ + size_t buf_free; + /** recommended maximum size of buf, after which the buffer is flushed */ + size_t max_buf_free; + /** mutex to serialize access to the flush list when we are putting + dirty blocks in the list. The idea behind this mutex is to be able + to release log_sys.mutex during mtr_commit and still ensure that + insertions in the flush_list happen in the LSN order. */ + MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex; + /** log_buffer, append data here */ + byte *buf; + /** log_buffer, writing data to file from this buffer. + Before flushing write_buf is swapped with flush_buf */ + byte *flush_buf; + /** Log file stuff. Protected by mutex. 
*/ + struct file { + /** format of the redo log: e.g., FORMAT_10_5 */ + uint32_t format; + /** redo log subformat: 0 with separately logged TRUNCATE, + 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */ + uint32_t subformat; + /** individual log file size in bytes, including the header */ + lsn_t file_size; + private: + /** lsn used to fix coordinates within the log group */ + lsn_t lsn; + /** the byte offset of the above lsn */ + lsn_t lsn_offset; + /** log file */ + log_file_t fd; + + public: + /** used only in recovery: recovery scan succeeded up to this + lsn in this log group */ + lsn_t scanned_lsn; + + /** opens log file which must be closed prior to this call */ + void open_file(std::string path); + /** writes header */ + void write_header_durable(lsn_t lsn); + /** renames the log file by forwarding the new path to fd.rename() */ + dberr_t rename(std::string path) { return fd.rename(path); } + /** reads buffer from log file + @param[in] offset offset in log file + @param[in] buf buffer where to read */ + void read(os_offset_t offset, span<byte> buf); + /** Tells whether writes are durable (durable writes do not require calling flush()) */ + bool writes_are_durable() const noexcept; + /** writes buffer to log file + @param[in] offset offset in log file + @param[in] buf buffer from which to write */ + void write(os_offset_t offset, span<byte> buf); + /** flushes OS page cache (excluding metadata!) for log file */ + void flush(); + /** closes log file */ + void close_file(); + + /** @return whether the redo log is encrypted */ + bool is_encrypted() const { return format & FORMAT_ENCRYPTED; } + /** @return whether the redo log is in the physical format */ + bool is_physical() const + { return (format & ~FORMAT_ENCRYPTED) == FORMAT_10_5; } + /** @return capacity in bytes */ + lsn_t capacity() const{ return file_size - LOG_FILE_HDR_SIZE; } + /** Calculate the offset of a log sequence number. 
+ @param[in] lsn log sequence number + @return offset within the log */ + inline lsn_t calc_lsn_offset(lsn_t lsn) const; + inline lsn_t calc_lsn_offset_old(lsn_t lsn) const; + + /** Set the field values to correspond to a given lsn. */ + void set_fields(lsn_t lsn) + { + lsn_t c_lsn_offset = calc_lsn_offset(lsn); + set_lsn(lsn); + set_lsn_offset(c_lsn_offset); + } + + /** Read a log segment to log_sys.buf. + @param[in,out] start_lsn in: read area start, + out: the last read valid lsn + @param[in] end_lsn read area end + @return whether no invalid blocks (e.g checksum mismatch) were found */ + bool read_log_seg(lsn_t* start_lsn, lsn_t end_lsn); + + /** Initialize the redo log buffer. */ + void create(); + + /** Close the redo log buffer. */ + void close() { close_file(); } + void set_lsn(lsn_t a_lsn); + lsn_t get_lsn() const { return lsn; } + void set_lsn_offset(lsn_t a_lsn); + lsn_t get_lsn_offset() const { return lsn_offset; } + } log; + + /** The fields involved in the log buffer flush @{ */ + + size_t buf_next_to_write;/*!< first offset in the log buffer + where the byte content may not exist + written to file, e.g., the start + offset of a log record catenated + later; this is advanced when a flush + operation is completed to all the log + groups */ + lsn_t write_lsn; /*!< last written lsn */ + lsn_t current_flush_lsn;/*!< end lsn for the current running + write + flush operation */ + std::atomic<size_t> pending_flushes; /*!< system calls in progress */ + std::atomic<size_t> flushes; /*!< system calls counter */ + + ulint n_log_ios; /*!< number of log i/os initiated thus + far */ + ulint n_log_ios_old; /*!< number of log i/o's at the + previous printout */ + time_t last_printout_time;/*!< when log_print was last time + called */ + /* @} */ + + /** Fields involved in checkpoints @{ */ + lsn_t log_capacity; /*!< capacity of the log; if + the checkpoint age exceeds this, it is + a serious error because it is possible + we will then overwrite log and spoil + crash 
recovery */ + lsn_t max_modified_age_async; + /*!< when this recommended + value for lsn - + buf_pool.get_oldest_modification() + is exceeded, we start an + asynchronous preflush of pool pages */ + lsn_t max_checkpoint_age; + /*!< this is the maximum allowed value + for lsn - last_checkpoint_lsn when a + new query step is started */ + ib_uint64_t next_checkpoint_no; + /*!< next checkpoint number */ + /** latest completed checkpoint (protected by log_sys.mutex) */ + Atomic_relaxed<lsn_t> last_checkpoint_lsn; + lsn_t next_checkpoint_lsn; + /*!< next checkpoint lsn */ + ulint n_pending_checkpoint_writes; + /*!< number of currently pending + checkpoint writes */ + + /** buffer for checkpoint header */ + byte *checkpoint_buf; + /* @} */ + +private: + bool m_initialised; +public: + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + log_t(): m_initialised(false) {} + + /** @return whether the redo log is encrypted */ + bool is_encrypted() const { return(log.is_encrypted()); } + /** @return whether the redo log is in the physical format */ + bool is_physical() const { return log.is_physical(); } + + bool is_initialised() const { return m_initialised; } + + lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const + { return lsn.load(order); } + void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); } + + lsn_t get_flushed_lsn() const + { return flushed_to_disk_lsn.load(std::memory_order_acquire); } + void set_flushed_lsn(lsn_t lsn) + { flushed_to_disk_lsn.store(lsn, std::memory_order_release); } + + bool check_flush_or_checkpoint() const + { + return UNIV_UNLIKELY + (check_flush_or_checkpoint_.load(std::memory_order_relaxed)); + } + void set_check_flush_or_checkpoint(bool flag= true) + { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); } + + bool has_encryption_key_rotation() const { + return log.format == 
FORMAT_ENC_10_4 || log.format == FORMAT_ENC_10_5; + } + + /** @return the log block header + trailer size */ + unsigned framing_size() const + { + return has_encryption_key_rotation() + ? LOG_BLOCK_HDR_SIZE + LOG_BLOCK_KEY + LOG_BLOCK_CHECKSUM + : LOG_BLOCK_HDR_SIZE + LOG_BLOCK_CHECKSUM; + } + /** @return the log block payload size */ + unsigned payload_size() const + { + return has_encryption_key_rotation() + ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM - + LOG_BLOCK_KEY + : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM; + } + /** @return the log block trailer offset */ + unsigned trailer_offset() const + { + return has_encryption_key_rotation() + ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY + : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; + } + + size_t get_pending_flushes() const + { + return pending_flushes.load(std::memory_order_relaxed); + } + + size_t get_flushes() const + { + return flushes.load(std::memory_order_relaxed); + } + + /** Initialise the redo log subsystem. */ + void create(); + + /** Shut down the redo log subsystem. */ + void close(); +}; + +/** Redo log system */ +extern log_t log_sys; +#ifdef UNIV_DEBUG +extern bool log_write_lock_own(); +#endif + +/** Calculate the offset of a log sequence number. 
+@param[in] lsn log sequence number +@return offset within the log */ +inline lsn_t log_t::file::calc_lsn_offset(lsn_t lsn) const +{ + ut_ad(this == &log_sys.log); + /* The lsn parameters are updated while holding both the mutexes + and it is ok to have either of them while reading */ +#ifdef SAFE_MUTEX + ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); +#endif /* SAFE_MUTEX */ + const lsn_t size = capacity(); + lsn_t l= lsn - this->lsn; + if (longlong(l) < 0) { + l = lsn_t(-longlong(l)) % size; + l = size - l; + } + + l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size); + l %= size; + return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE)); +} + +inline void log_t::file::set_lsn(lsn_t a_lsn) +{ +#ifdef SAFE_MUTEX + ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); +#endif /* SAFE_MUTEX */ + lsn= a_lsn; +} + +inline void log_t::file::set_lsn_offset(lsn_t a_lsn) +{ +#ifdef SAFE_MUTEX + ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); +#endif /* SAFE_MUTEX */ + ut_ad((lsn % OS_FILE_LOG_BLOCK_SIZE) == (a_lsn % OS_FILE_LOG_BLOCK_SIZE)); + lsn_offset= a_lsn; +} + +#include "log0log.ic" + +#endif diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic new file mode 100644 index 00000000..d503e3ff --- /dev/null +++ b/storage/innobase/include/log0log.ic @@ -0,0 +1,326 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0log.ic +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "assume_aligned.h" +#include "ut0crc32.h" + +extern ulong srv_log_buffer_size; + +/************************************************************//** +Gets a log block flush bit. +@return TRUE if this block was the first to be written in a log flush */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + const byte* log_block) /*!< in: log block */ +{ + static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); + static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility"); + + return *log_block & 0x80; +} + +/************************************************************//** +Sets the log block flush bit. */ +UNIV_INLINE +void +log_block_set_flush_bit( +/*====================*/ + byte* log_block, /*!< in/out: log block */ + ibool val) /*!< in: value to set */ +{ + static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); + static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility"); + + if (val) + *log_block|= 0x80; + else + *log_block&= 0x7f; +} + +/************************************************************//** +Gets a log block number stored in the header. 
+@return log block number stored in the block header */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + const byte* log_block) /*!< in: log block */ +{ + static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); + return mach_read_from_4(my_assume_aligned<4>(log_block)) & + ~LOG_BLOCK_FLUSH_BIT_MASK; +} + +/************************************************************//** +Sets the log block number stored in the header; NOTE that this must be set +before the flush bit! */ +UNIV_INLINE +void +log_block_set_hdr_no( +/*=================*/ + byte* log_block, /*!< in/out: log block */ + ulint n) /*!< in: log block number: must be > 0 and + < LOG_BLOCK_FLUSH_BIT_MASK */ +{ + static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); + ut_ad(n > 0); + ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK); + + mach_write_to_4(my_assume_aligned<4>(log_block), n); +} + +/************************************************************//** +Gets a log block data length. +@return log block data length measured as a byte offset from the block start */ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + const byte* log_block) /*!< in: log block */ +{ + return mach_read_from_2(my_assume_aligned<2> + (log_block + LOG_BLOCK_HDR_DATA_LEN)); +} + +/************************************************************//** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint len) /*!< in: data length */ +{ + mach_write_to_2(my_assume_aligned<2>(log_block + LOG_BLOCK_HDR_DATA_LEN), + len); +} + +/************************************************************//** +Gets a log block first mtr log record group offset. 
+@return first mtr log record group byte offset from the block start, 0 +if none */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + const byte* log_block) /*!< in: log block */ +{ + return mach_read_from_2(my_assume_aligned<2> + (log_block + LOG_BLOCK_FIRST_REC_GROUP)); +} + +/************************************************************//** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /*!< in/out: log block */ + ulint offset) /*!< in: offset, 0 if none */ +{ + mach_write_to_2(my_assume_aligned<2> + (log_block + LOG_BLOCK_FIRST_REC_GROUP), offset); +} + +/************************************************************//** +Gets a log block checkpoint number field (4 lowest bytes). +@return checkpoint no (4 lowest bytes) */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + const byte* log_block) /*!< in: log block */ +{ + return mach_read_from_4(my_assume_aligned<4> + (log_block + LOG_BLOCK_CHECKPOINT_NO)); +} + +/************************************************************//** +Sets a log block checkpoint number field (4 lowest bytes). */ +UNIV_INLINE +void +log_block_set_checkpoint_no( +/*========================*/ + byte* log_block, /*!< in/out: log block */ + ib_uint64_t no) /*!< in: checkpoint no */ +{ + mach_write_to_4(my_assume_aligned<4>(log_block + LOG_BLOCK_CHECKPOINT_NO), + static_cast<uint32_t>(no)); +} + +/************************************************************//** +Converts a lsn to a log block number. 
+@return log block number, it is > 0 and <= 1G */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + lsn_t lsn) /*!< in: lsn of a byte within the block */ +{ + return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & + DBUG_EVALUATE_IF("innodb_small_log_block_no_limit", + 0xFUL, 0x3FFFFFFFUL)) + 1); +} + +/** Calculate the CRC-32C checksum of a log block. +@param[in] block log block +@return checksum */ +inline ulint log_block_calc_checksum_crc32(const byte* block) +{ + return ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM); +} + +/************************************************************//** +Gets a log block checksum field value. +@return checksum */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + const byte* log_block) /*!< in: log block */ +{ + return mach_read_from_4(my_assume_aligned<4> + (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM + + log_block)); +} + +/************************************************************//** +Sets a log block checksum field value. */ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint checksum) /*!< in: checksum */ +{ + mach_write_to_4(my_assume_aligned<4> + (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM + + log_block), checksum); +} + +/************************************************************//** +Initializes a log block in the log buffer. */ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn) /*!< in: lsn within the log block */ +{ + ulint no; + + no = log_block_convert_lsn_to_no(lsn); + + log_block_set_hdr_no(log_block, no); + + log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); + log_block_set_first_rec_group(log_block, 0); +} + +/** Append a string to the log. 
+@param[in] str string +@param[in] len string length +@param[out] start_lsn start LSN of the log record +@return end lsn of the log record, zero if did not succeed */ +UNIV_INLINE +lsn_t +log_reserve_and_write_fast( + const void* str, + ulint len, + lsn_t* start_lsn) +{ + mysql_mutex_assert_owner(&log_sys.mutex); + ut_ad(len > 0); + + const ulint data_len = len + + log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; + + if (data_len >= log_sys.trailer_offset()) { + + /* The string does not fit within the current log block + or the log block would become full */ + + return(0); + } + + lsn_t lsn = log_sys.get_lsn(); + *start_lsn = lsn; + + memcpy(log_sys.buf + log_sys.buf_free, str, len); + + log_block_set_data_len( + reinterpret_cast<byte*>(ut_align_down( + log_sys.buf + log_sys.buf_free, + OS_FILE_LOG_BLOCK_SIZE)), + data_len); + + log_sys.buf_free += len; + + ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size}); + + lsn += len; + log_sys.set_lsn(lsn); + + return lsn; +} + +/***********************************************************************//** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void) +/*================*/ +{ + /* During row_log_table_apply(), this function will be called while we + are holding some latches. This is OK, as long as we are not holding + any latches on buffer blocks. 
*/ + +#ifdef UNIV_DEBUG + static const latch_level_t latches[] = { + SYNC_DICT, /* dict_sys.mutex during + commit_try_rebuild() */ + SYNC_DICT_OPERATION, /* dict_sys.latch X-latch during + commit_try_rebuild() */ + SYNC_FTS_CACHE, /* fts_cache_t::lock */ + SYNC_INDEX_TREE /* index->lock */ + }; +#endif /* UNIV_DEBUG */ + + ut_ad(!sync_check_iterate( + sync_allowed_latches(latches, + latches + UT_ARR_SIZE(latches)))); + + if (log_sys.check_flush_or_checkpoint()) { + + log_check_margins(); + } +} diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h new file mode 100644 index 00000000..f822a874 --- /dev/null +++ b/storage/innobase/include/log0recv.h @@ -0,0 +1,426 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0recv.h +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "ut0byte.h" +#include "buf0types.h" +#include "log0log.h" +#include "mtr0types.h" + +#include <deque> + +/** @return whether recovery is currently running. */ +#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on) + +/** Find the latest checkpoint in the log header. +@param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 +@return error code or DB_SUCCESS */ +dberr_t +recv_find_max_checkpoint(ulint* max_field) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Apply any buffered redo log to a page that was just read from a data file. +@param[in,out] space tablespace +@param[in,out] bpage buffer pool page */ +ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage) + MY_ATTRIBUTE((nonnull)); + +/** Start recovering from a redo log checkpoint. +@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN +of first system tablespace page +@return error code or DB_SUCCESS */ +dberr_t +recv_recovery_from_checkpoint_start( + lsn_t flush_lsn); + +/** Whether to store redo log records in recv_sys.pages */ +enum store_t { + /** Do not store redo log records. */ + STORE_NO, + /** Store redo log records. */ + STORE_YES, + /** Store redo log records if the tablespace exists. */ + STORE_IF_EXISTS +}; + + +/** Adds data from a new log block to the parsing buffer of recv_sys if +recv_sys.parse_start_lsn is non-zero. 
+@param[in] log_block log block to add +@param[in] scanned_lsn lsn of how far we were able to find + data in this log block +@return true if more data added */ +bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn); + +/** Moves the parsing buffer data left to the buffer start */ +void recv_sys_justify_left_parsing_buf(); + +/** Report an operation to create, delete, or rename a file during backup. +@param[in] space_id tablespace identifier +@param[in] create whether the file is being created +@param[in] name file name (not NUL-terminated) +@param[in] len length of name, in bytes +@param[in] new_name new file name (NULL if not rename) +@param[in] new_len length of new_name, in bytes (0 if NULL) */ +extern void (*log_file_op)(ulint space_id, bool create, + const byte* name, ulint len, + const byte* new_name, ulint new_len); + +/** Stored redo log record */ +struct log_rec_t +{ + log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); } + log_rec_t()= delete; + log_rec_t(const log_rec_t&)= delete; + log_rec_t &operator=(const log_rec_t&)= delete; + + /** next record */ + log_rec_t *next; + /** mtr_t::commit_lsn() of the mini-transaction */ + const lsn_t lsn; +}; + +struct recv_dblwr_t +{ + /** Add a page frame to the doublewrite recovery buffer. */ + void add(byte *page) { pages.push_front(page); } + + /** Validate the page. + @param page_id page identifier + @param page page contents + @param space the tablespace of the page (not available for page 0) + @param tmp_buf 2*srv_page_size for decrypting and decompressing any + page_compressed or encrypted pages + @return whether the page is valid */ + bool validate_page(const page_id_t page_id, const byte *page, + const fil_space_t *space, byte *tmp_buf); + + /** Find a doublewrite copy of a page. 
+ @param page_id page identifier + @param space tablespace (not available for page_id.page_no()==0) + @param tmp_buf 2*srv_page_size for decrypting and decompressing any + page_compressed or encrypted pages + @return page frame + @retval NULL if no valid page for page_id was found */ + byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL, + byte *tmp_buf= NULL); + + typedef std::deque<byte*, ut_allocator<byte*> > list; + + /** Recovered doublewrite buffer page frames */ + list pages; +}; + +/** the recovery state and buffered records for a page */ +struct page_recv_t +{ + /** Recovery state; protected by recv_sys.mutex */ + enum + { + /** not yet processed */ + RECV_NOT_PROCESSED, + /** not processed; the page will be reinitialized */ + RECV_WILL_NOT_READ, + /** page is being read */ + RECV_BEING_READ, + /** log records are being applied on the page */ + RECV_BEING_PROCESSED + } state= RECV_NOT_PROCESSED; + /** Latest written byte offset when applying the log records. 
+ @see mtr_t::m_last_offset */ + uint16_t last_offset= 1; + /** log records for a page */ + class recs_t + { + /** The first log record */ + log_rec_t *head= nullptr; + /** The last log record */ + log_rec_t *tail= nullptr; + friend struct page_recv_t; + public: + /** Append a redo log snippet for the page + @param recs log snippet */ + void append(log_rec_t* recs) + { + if (tail) + tail->next= recs; + else + head= recs; + tail= recs; + } + + /** @return the last log snippet */ + const log_rec_t* last() const { return tail; } + /** @return the last log snippet */ + log_rec_t* last() { return tail; } + + class iterator + { + log_rec_t *cur; + public: + iterator(log_rec_t* rec) : cur(rec) {} + log_rec_t* operator*() const { return cur; } + iterator &operator++() { cur= cur->next; return *this; } + bool operator!=(const iterator& i) const { return cur != i.cur; } + }; + iterator begin() { return head; } + iterator end() { return NULL; } + bool empty() const { ut_ad(!head == !tail); return !head; } + /** Clear and free the records; @see recv_sys_t::alloc() */ + inline void clear(); + } log; + + /** Trim old log records for a page. + @param start_lsn oldest log sequence number to preserve + @return whether all the log for the page was trimmed */ + inline bool trim(lsn_t start_lsn); + /** Ignore any earlier redo log records for this page. 
*/ + inline void will_not_read(); + /** @return whether the log records for the page are being processed */ + bool is_being_processed() const { return state == RECV_BEING_PROCESSED; } +}; + +/** Recovery system data structure */ +struct recv_sys_t +{ + /** mutex protecting apply_log_recs and page_recv_t::state */ + ib_mutex_t mutex; + /** whether we are applying redo log records during crash recovery */ + bool recovery_on; + /** whether recv_recover_page(), invoked from buf_page_read_complete(), + should apply log records*/ + bool apply_log_recs; + /** whether recv_apply_hashed_log_recs() is running */ + bool apply_batch_on; + byte* buf; /*!< buffer for parsing log records */ + ulint len; /*!< amount of data in buf */ + lsn_t parse_start_lsn; + /*!< this is the lsn from which we were able to + start parsing log records and adding them to + pages; zero if a suitable + start point not found yet */ + lsn_t scanned_lsn; + /*!< the log data has been scanned up to this + lsn */ + ulint scanned_checkpoint_no; + /*!< the log data has been scanned up to this + checkpoint number (lowest 4 bytes) */ + ulint recovered_offset; + /*!< start offset of non-parsed log records in + buf */ + lsn_t recovered_lsn; + /*!< the log records have been parsed up to + this lsn */ + bool found_corrupt_log; + /*!< set when finding a corrupt log + block or record, or there is a log + parsing buffer overflow */ + bool found_corrupt_fs; + /*!< set when an inconsistency with + the file system contents is detected + during log scan or apply */ + lsn_t mlog_checkpoint_lsn; + /*!< the LSN of a FILE_CHECKPOINT + record, or 0 if none was parsed */ + /** the time when progress was last reported */ + time_t progress_time; + + using map = std::map<const page_id_t, page_recv_t, + std::less<const page_id_t>, + ut_allocator<std::pair<const page_id_t, page_recv_t>>>; + /** buffered records waiting to be applied to pages */ + map pages; + +private: + /** Process a record that indicates that a tablespace size is 
being shrunk. + @param page_id first page that is not in the file + @param lsn log sequence number of the shrink operation */ + inline void trim(const page_id_t page_id, lsn_t lsn); + + /** Undo tablespaces for which truncate has been logged + (indexed by page_id_t::space() - srv_undo_space_id_start) */ + struct trunc + { + /** log sequence number of FILE_CREATE, or 0 if none */ + lsn_t lsn; + /** truncated size of the tablespace, or 0 if not truncated */ + unsigned pages; + } truncated_undo_spaces[127]; + +public: + /** The contents of the doublewrite buffer */ + recv_dblwr_t dblwr; + + /** Last added LSN to pages. */ + lsn_t last_stored_lsn= 0; + + void read(os_offset_t offset, span<byte> buf); + inline size_t files_size(); + void close_files() { files.clear(); files.shrink_to_fit(); } + +private: + /** Attempt to initialize a page based on redo log records. + @param page_id page identifier + @param p iterator pointing to page_id + @param mtr mini-transaction + @param b pre-allocated buffer pool block + @return whether the page was successfully initialized */ + inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p, + mtr_t &mtr, buf_block_t *b); + /** Attempt to initialize a page based on redo log records. + @param page_id page identifier + @return the recovered block + @retval nullptr if the page cannot be initialized based on log records */ + buf_block_t *recover_low(const page_id_t page_id); + + /** All found log files (multiple ones are possible if we are upgrading + from before MariaDB Server 10.5.1) */ + std::vector<log_file_t> files; + + void open_log_files_if_needed(); + + /** Base node of the redo block list. + List elements are linked via buf_block_t::unzip_LRU. */ + UT_LIST_BASE_NODE_T(buf_block_t) blocks; +public: + /** Check whether the number of read redo log blocks exceeds the maximum. + Store last_stored_lsn if the recovery is not in the last phase. 
+ @param[in,out] store whether to store page operations + @return whether the memory is exhausted */ + inline bool is_memory_exhausted(store_t *store); + /** Apply buffered log to persistent data pages. + @param last_batch whether it is possible to write more redo log */ + void apply(bool last_batch); + +#ifdef UNIV_DEBUG + /** whether all redo log in the current batch has been applied */ + bool after_apply= false; +#endif + /** Initialize the redo log recovery subsystem. */ + void create(); + + /** Free most recovery data structures. */ + void debug_free(); + + /** Clean up after create() */ + void close(); + + bool is_initialised() const { return last_stored_lsn != 0; } + + /** Register a redo log snippet for a page. + @param page_id page identifier + @param start_lsn start LSN of the mini-transaction + @param lsn @see mtr_t::commit_lsn() + @param l redo log snippet @see log_t::FORMAT_10_5 + @param len length of l, in bytes */ + inline void add(const page_id_t page_id, lsn_t start_lsn, lsn_t lsn, + const byte *l, size_t len); + + /** Parse and register one mini-transaction in log_t::FORMAT_10_5. + @param checkpoint_lsn the log sequence number of the latest checkpoint + @param store whether to store the records + @param apply whether to apply file-level log records + @return whether FILE_CHECKPOINT record was seen the first time, + or corruption was noticed */ + bool parse(lsn_t checkpoint_lsn, store_t *store, bool apply); + + /** Clear a fully processed set of stored redo log records. */ + inline void clear(); + + /** Determine whether redo log recovery progress should be reported. 
+ @param time the current time + @return whether progress should be reported + (the last report was at least 15 seconds ago) */ + bool report(time_t time) + { + if (time - progress_time < 15) + return false; + + progress_time= time; + return true; + } + + /** The alloc() memory alignment, in bytes */ + static constexpr size_t ALIGNMENT= sizeof(size_t); + + /** Allocate memory for log_rec_t + @param len allocation size, in bytes + @return pointer to len bytes of memory (never NULL) */ + inline void *alloc(size_t len); + + /** Free a redo log snippet. + @param data buffer returned by alloc() */ + inline void free(const void *data); + + /** Remove records for a corrupted page. + This function should only be called when innodb_force_recovery is set. + @param page_id corrupted page identifier */ + ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id); + + /** Attempt to initialize a page based on redo log records. + @param page_id page identifier + @return the recovered block + @retval nullptr if the page cannot be initialized based on log records */ + buf_block_t *recover(const page_id_t page_id) + { + return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr; + } +}; + +/** The recovery system */ +extern recv_sys_t recv_sys; + +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this will be set if +recv_sys.pages becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +TRUE means that recovery is running and no operations on the log files +are allowed yet: the variable name is misleading. */ +extern bool recv_no_ibuf_operations; +/** TRUE when recv_init_crash_recovery() has been called. 
*/ +extern bool recv_needed_recovery; +#ifdef UNIV_DEBUG +/** TRUE if writing to the redo log (mtr_commit) is forbidden. +Protected by log_sys.mutex. */ +extern bool recv_no_log_write; +#endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start(). */ +extern bool recv_lsn_checks_on; + +/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many +times! */ +#define RECV_PARSING_BUF_SIZE (2U << 20) + +/** Size of block reads when the log groups are scanned forward to do a +roll-forward */ +#define RECV_SCAN_SIZE (4U << srv_page_size_shift) diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h new file mode 100644 index 00000000..337fcd31 --- /dev/null +++ b/storage/innobase/include/log0types.h @@ -0,0 +1,44 @@ +/***************************************************************************** + +Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0types.h +Log types + +Created 2013-03-15 Sunny Bains +*******************************************************/ + +#ifndef log0types_h +#define log0types_h + +#include "univ.i" + +/* Type used for all log sequence number storage and arithmetics */ +typedef ib_uint64_t lsn_t; + +#define LSN_MAX IB_UINT64_MAX + +#define LSN_PF UINT64PF + +#endif /* log0types_h */ diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h new file mode 100644 index 00000000..88317a73 --- /dev/null +++ b/storage/innobase/include/mach0data.h @@ -0,0 +1,353 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.h +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef mach0data_h +#define mach0data_h + +#include "univ.i" +#include "mtr0types.h" + +#ifndef UNIV_INNOCHECKSUM + +/* The data and all fields are always stored in a database file +in the same format: ascii, big-endian, ... . +All data in the files MUST be accessed using the functions in this +module. */ + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */ +/** The following function is used to fetch data from one byte. +@param[in] b pointer to a byte to read +@return 8-bit integer, >= 0, < 256 */ +UNIV_INLINE +uint8_t +mach_read_from_1( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lower address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */ +#endif /* !UNIV_INNOCHECKSUM */ +/** The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. 
+@param[in] b pointer to 2 bytes to read +@return 2-byte integer, >= 0, < 64k */ +UNIV_INLINE +uint16_t +mach_read_from_2( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); + +#ifndef UNIV_INNOCHECKSUM +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ + MY_ATTRIBUTE((const)); +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ + MY_ATTRIBUTE((const)); +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/** The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 3 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_3( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. 
*/ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/** The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 4 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_4( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a ulint in a compressed form (1..5 bytes). +@return stored size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/*********************************************************//** +Returns the size of an ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer to be stored */ + MY_ATTRIBUTE((const)); +/** Read a 32-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint32_t +mach_read_next_compressed( + const byte** b); +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t id); /*!< in: 48-bit integer */ +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. 
+@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n); /*!< in: 56-bit integer */ +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). 
+@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_next_compressed( + const byte** b); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************//** +Reads a double. It is stored in a little-endian format. +@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d); /*!< in: double */ +/*********************************************************//** +Reads a float. It is stored in a little-endian format. 
+@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d); /*!< in: float */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. 
+@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type); /*!< in: nonzero if the stored value is unsigned */ + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. For a signed value the most significant bit of the +first stored byte is inverted. */ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< out: where to write */ + ulonglong src, /*!< in: integer value to be stored */ + ulint len, /*!< in: length of dest */ + bool usign); /*!< in: true if the value is unsigned */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#include "mach0data.ic" + +#endif diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic new file mode 100644 index 00000000..bfccf611 --- /dev/null +++ b/storage/innobase/include/mach0data.ic @@ -0,0 +1,836 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.ic +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +#include "mtr0types.h" + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */ +{ + ut_ad((n & ~0xFFUL) == 0); + + b[0] = (byte) n; +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad((n & ~0xFFFFUL) == 0); + + b[0] = (byte)(n >> 8); + b[1] = (byte)(n); +} + +/** The following function is used to fetch data from one byte. +@param[in] b pointer to a byte to read +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +uint8_t +mach_read_from_1( + const byte* b) +{ + return(uint8_t(*b)); +} + +/** The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. 
+@param[in] b pointer to 2 bytes to read +@return 2-byte integer, >= 0, < 64k */ +UNIV_INLINE +uint16_t +mach_read_from_2( + const byte* b) +{ + return(uint16_t(uint16_t(b[0]) << 8 | b[1])); +} + +#ifndef UNIV_INNOCHECKSUM + +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ +{ + uint16 ret; + ut_ad(2 == sizeof ret); + mach_write_to_2((byte*) &ret, n); + return(ret); +} +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ +{ + ut_ad(2 == sizeof n); + return(mach_read_from_2((const byte*) &n)); +} + +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad((n & ~0xFFFFFFUL) == 0); + + b[0] = (byte)(n >> 16); + b[1] = (byte)(n >> 8); + b[2] = (byte)(n); +} + +/** The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. 
+@param[in] b pointer to 3 bytes to read +@return uint32_t integer */ +UNIV_INLINE +uint32_t +mach_read_from_3( + const byte* b) +{ + return( (static_cast<uint32_t>(b[0]) << 16) + | (static_cast<uint32_t>(b[1]) << 8) + | static_cast<uint32_t>(b[2]) + ); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + b[0] = (byte)(n >> 24); + b[1] = (byte)(n >> 16); + b[2] = (byte)(n >> 8); + b[3] = (byte) n; +} + +/** The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 4 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_4( + const byte* b) +{ + return( (static_cast<uint32_t>(b[0]) << 24) + | (static_cast<uint32_t>(b[1]) << 16) + | (static_cast<uint32_t>(b[2]) << 8) + | static_cast<uint32_t>(b[3]) + ); +} + +#ifndef UNIV_INNOCHECKSUM + +/*********************************************************//** +Writes a ulint in a compressed form where the first byte codes the +length of the stored ulint. We look at the most significant bits of +the byte. If the most significant bit is zero, it means 1-byte storage, +else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0, +it means 3-byte storage, else if 4th is 0, it means 4-byte storage, +else the storage is 5-byte. 
+@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80) { + /* 0nnnnnnn (7 bits) */ + mach_write_to_1(b, n); + return(1); + } else if (n < 0x4000) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + mach_write_to_2(b, n | 0x8000); + return(2); + } else if (n < 0x200000) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + mach_write_to_3(b, n | 0xC00000); + return(3); + } else if (n < 0x10000000) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + mach_write_to_4(b, n | 0xE0000000); + return(4); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + mach_write_to_1(b, 0xF0); + mach_write_to_4(b + 1, n); + return(5); + } +} + +/*********************************************************//** +Returns the size of a ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80) { + /* 0nnnnnnn (7 bits) */ + return(1); + } else if (n < 0x4000) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + return(2); + } else if (n < 0x200000) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + return(3); + } else if (n < 0x10000000) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + return(4); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + return(5); + } +} + +/*********************************************************//** +Reads a ulint in a compressed form. 
+@return read integer (< 2^32) */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ulint val; + + val = mach_read_from_1(b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(b) & 0x3FFF; + ut_ad(val > 0x7F); + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + ut_ad(val == 0xF0); + val = mach_read_from_4(b + 1); + ut_ad(val > 0xFFFFFFF); + } + + return(val); +} + +/** Read a 32-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint32_t +mach_read_next_compressed( + const byte** b) +{ + ulint val = mach_read_from_1(*b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + ++*b; + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(*b) & 0x3FFF; + ut_ad(val > 0x7F); + *b += 2; + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(*b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + *b += 3; + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(*b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + *b += 4; + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + ut_ad(val == 0xF0); + val = mach_read_from_4(*b + 1); + ut_ad(val > 0xFFFFFFF); + *b += 5; + } + + return(static_cast<ib_uint32_t>(val)); +} + +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. 
We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32)); + mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ +{ + ib_uint64_t u64; + + u64 = mach_read_from_4(b); + u64 <<= 32; + u64 |= mach_read_from_4(b + 4); + + return(u64); +} + +#ifndef UNIV_INNOCHECKSUM + +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n) /*!< in: 56-bit integer */ +{ + mach_write_to_3(b, (ulint) (n >> 32)); + mach_write_to_4(b + 3, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ +{ + return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3))); +} + +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. 
*/ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t n) /*!< in: 48-bit integer */ +{ + mach_write_to_2(b, (ulint) (n >> 32)); + mach_write_to_4(b + 2, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. +@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ +{ + return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2))); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size = mach_write_compressed(b, (ulint) (n >> 32)); + mach_write_to_4(b + size, (ulint) n); + + return(size + 4); +} + +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_next_compressed( + const byte** b) +{ + ib_uint64_t val; + + val = mach_read_next_compressed(b); + val <<= 32; + val |= mach_read_from_4(*b); + *b += 4; + return(val); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). 
+@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size; + + if (!(n >> 32)) { + return(mach_write_compressed(b, (ulint) n)); + } + + *b = (byte)0xFF; + size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32)); + + size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF); + + return(size); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ib_uint64_t n; + + if (*b != 0xFF) { + return(mach_read_compressed(b)); + } + + b++; + n = mach_read_next_compressed(&b); + n <<= 32; + n |= mach_read_compressed(b); + + return(n); +} + +/** Read a 64-bit integer in a compressed form. 
+@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_read_next_much_compressed( + const byte** b) +{ + ib_uint64_t val = mach_read_from_1(*b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + ++*b; + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(*b) & 0x3FFF; + ut_ad(val > 0x7F); + *b += 2; + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(*b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + *b += 3; + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(*b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + *b += 4; + } else if (val == 0xF0) { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + val = mach_read_from_4(*b + 1); + ut_ad(val > 0xFFFFFFF); + *b += 5; + } else { + /* 11111111 followed by up to 64 bits */ + ut_ad(val == 0xFF); + ++*b; + val = mach_read_next_compressed(b); + ut_ad(val > 0); + val <<= 32; + val |= mach_read_next_compressed(b); + } + + return(val); +} + +/*********************************************************//** +Reads a double. It is stored in a little-endian format. +@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + double d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(double) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a double. It is stored in a little-endian format. 
*/ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d) /*!< in: double */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(double) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + float d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(float) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d) /*!< in: float */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(float) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ +{ + ulint n = 0; + const byte* ptr; + + ut_ad(buf_size > 0); + + ptr = buf + buf_size; + + for (;;) { + ptr--; + + n = n << 8; + + n += (ulint)(*ptr); + + if (ptr == buf) { + break; + } + } + + return(n); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. 
*/ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + byte* end; + + ut_ad(dest_size <= sizeof(ulint)); + ut_ad(dest_size > 0); + + end = dest + dest_size; + + for (;;) { + *dest = (byte)(n & 0xFF); + + n = n >> 8; + + dest++; + + if (dest == end) { + break; + } + } + + ut_ad(n == 0); +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ +{ + return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8)); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + ut_ad(n < 256 * 256); + + *dest = (byte)(n & 0xFFUL); + + n = n >> 8; + dest++; + + *dest = (byte)(n & 0xFFUL); +} + +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. 
+@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + /* XXX this can be optimized on big-endian machines */ + + uintmax_t ret; + uint i; + + if (unsigned_type || (src[0] & 0x80)) { + + ret = 0x0000000000000000ULL; + } else { + + ret = 0xFFFFFFFFFFFFFF00ULL; + } + + if (unsigned_type) { + + ret |= src[0]; + } else { + + ret |= src[0] ^ 0x80; + } + + for (i = 1; i < len; i++) { + ret <<= 8; + ret |= src[i]; + } + + return(ret); +} +/*********************************************************//** +Swap byte ordering. */ +UNIV_INLINE +void +mach_swap_byte_order( +/*=================*/ + byte* dest, /*!< out: where to write */ + const byte* from, /*!< in: where to read from */ + ulint len) /*!< in: length of src */ +{ + ut_ad(len > 0); + ut_ad(len <= 8); + + dest += len; + + switch (len & 0x7) { + case 0: *--dest = *from++; /* fall through */ + case 7: *--dest = *from++; /* fall through */ + case 6: *--dest = *from++; /* fall through */ + case 5: *--dest = *from++; /* fall through */ + case 4: *--dest = *from++; /* fall through */ + case 3: *--dest = *from++; /* fall through */ + case 2: *--dest = *from++; /* fall through */ + case 1: *--dest = *from; + } +} + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. 
*/ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign) /*!< in: signed or unsigned flag */ +{ + byte* ptr = reinterpret_cast<byte*>(&src); + + ut_ad(len <= sizeof(ulonglong)); + +#ifdef WORDS_BIGENDIAN + memcpy(dest, ptr + (sizeof(src) - len), len); +#else + mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len); +#endif /* WORDS_BIGENDIAN */ + + if (!usign) { + *dest ^= 0x80; + } +} + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h new file mode 100644 index 00000000..b7fd9c09 --- /dev/null +++ b/storage/innobase/include/mem0mem.h @@ -0,0 +1,345 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0mem.h +The memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0mem_h +#define mem0mem_h + +#include "ut0mem.h" +#include "ut0rnd.h" +#include "mach0data.h" + +#include <memory> + +/* -------------------- MEMORY HEAPS ----------------------------- */ + +/** A block of a memory heap consists of the info structure +followed by an area of memory */ +typedef struct mem_block_info_t mem_block_t; + +/** A memory heap is a nonempty linear list of memory blocks */ +typedef mem_block_t mem_heap_t; + +/** Types of allocation for memory heaps: DYNAMIC means allocation from the +dynamic memory pool of the C compiler, BUFFER means allocation from the +buffer pool; the latter method is used for very big heaps */ + +#define MEM_HEAP_DYNAMIC 0 /* the most common type */ +#define MEM_HEAP_BUFFER 1 +#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be + ORed to MEM_HEAP_BUFFER, in which + case heap->free_block is used in + some cases for memory allocations, + and if it's NULL, the memory + allocation functions can return + NULL. */ + +/** Different type of heaps in terms of which datastructure is using them */ +#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER) +#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER) + +/** The following start size is used for the first block in the memory heap if +the size is not specified, i.e., 0 is given as the parameter in the call of +create. The standard size is the maximum (payload) size of the blocks used for +allocations of small buffers. 
*/ + +#define MEM_BLOCK_START_SIZE 64 +#define MEM_BLOCK_STANDARD_SIZE \ + (srv_page_size >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF) + +/** If a memory heap is allowed to grow into the buffer pool, the following +is the maximum size for a single allocated buffer: */ +#define MEM_MAX_ALLOC_IN_BUF (srv_page_size - 200 + REDZONE_SIZE) + +/** Space needed when allocating for a user a field of length N. +The space is allocated only in multiples of UNIV_MEM_ALIGNMENT. */ +#define MEM_SPACE_NEEDED(N) UT_CALC_ALIGN((N), UNIV_MEM_ALIGNMENT) + +#ifdef UNIV_DEBUG +/** Macro for memory heap creation. +@param[in] size Desired start block size. */ +# define mem_heap_create(size) \ + mem_heap_create_func((size), __FILE__, __LINE__, MEM_HEAP_DYNAMIC) + +/** Macro for memory heap creation. +@param[in] size Desired start block size. +@param[in] type Heap type */ +# define mem_heap_create_typed(size, type) \ + mem_heap_create_func((size), __FILE__, __LINE__, (type)) + +#else /* UNIV_DEBUG */ +/** Macro for memory heap creation. +@param[in] size Desired start block size. */ +# define mem_heap_create(size) mem_heap_create_func((size), MEM_HEAP_DYNAMIC) + +/** Macro for memory heap creation. +@param[in] size Desired start block size. +@param[in] type Heap type */ +# define mem_heap_create_typed(size, type) \ + mem_heap_create_func((size), (type)) + +#endif /* UNIV_DEBUG */ + +/** Creates a memory heap. +NOTE: Use the corresponding macros instead of this function. +A single user buffer of 'size' will fit in the block. +0 creates a default size block. +@param[in] size Desired start block size. 
+@param[in] file_name File name where created (UNIV_DEBUG builds only)
+@param[in] line Line where created (UNIV_DEBUG builds only)
+@param[in] type Heap type
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+	ulint		size,
+#ifdef UNIV_DEBUG
+	const char*	file_name,
+	unsigned	line,
+#endif /* UNIV_DEBUG */
+	ulint		type);
+
+/** Frees the space occupied by a memory heap.
+NOTE: Use the corresponding macro instead of this function.
+All blocks of the heap are released, including the first one, so the heap
+handle must not be used afterwards.
+@param[in] heap Heap to be freed */
+UNIV_INLINE
+void
+mem_heap_free(
+	mem_heap_t*	heap);
+
+/** Allocates and zero-fills n bytes of memory from a memory heap.
+The implementation asserts (ut_ad) that the heap is not of
+MEM_HEAP_BTR_SEARCH type, because the result is written to unconditionally.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+	mem_heap_t*	heap,
+	ulint		n);
+
+/** Allocates n bytes of memory from a memory heap.
+The returned storage is not initialized.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+	mem_heap_t*	heap,
+	ulint		n);
+
+/** Returns a pointer to the heap top, that is, to the first free byte of
+the last block of the heap.
+@param[in] heap memory heap
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+	mem_heap_t*	heap);
+
+/** Frees the space in a memory heap exceeding the pointer given.
+The pointer must have been acquired from mem_heap_get_heap_top.
+The first memory block of the heap is not freed.
+@param[in] heap heap from which to free
+@param[in] old_top pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+	mem_heap_t*	heap,
+	byte*		old_top);
+
+/** Empties a memory heap.
+The first memory block of the heap is not freed. 
+@param[in] heap heap to empty */ +UNIV_INLINE +void +mem_heap_empty( + mem_heap_t* heap); + +/** Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. +@param[in] heap memory heap +@param[in] n size of the topmost element +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( + mem_heap_t* heap, + ulint n); + +/*****************************************************************//** +Frees the topmost element in a memory heap. +The size of the element must be given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: size of the topmost element */ +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap); /*!< in: heap */ + +/**********************************************************************//** +Duplicates a NUL-terminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str); /*!< in: string to be copied */ +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len); /*!< in: length of str, in bytes */ + +/** Duplicate a block of data, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] data block of data to be copied +@param[in] len length of data, in bytes +@return own: a copy of data */ +inline +void* +mem_heap_dup(mem_heap_t* heap, const void* data, size_t len) +{ + ut_ad(data || !len); + return UNIV_LIKELY(data != NULL) + ? 
memcpy(mem_heap_alloc(heap, len), data, len) + : NULL; +} + +/** Duplicate a NUL-terminated string, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] str string to be copied +@return own: a copy of the string */ +inline +char* +mem_heap_strdup(mem_heap_t* heap, const char* str) +{ + return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1))); +} + +/** Duplicate a string, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] str string to be copied +@param[in] len length of str, in bytes +@return own: a NUL-terminated copy of str */ +inline +char* +mem_heap_strdupl(mem_heap_t* heap, const char* str, size_t len) +{ + char* s = static_cast<char*>(mem_heap_alloc(heap, len + 1)); + s[len] = 0; + return(static_cast<char*>(memcpy(s, str, len))); +} + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2); /*!< in: string 2 */ + +/****************************************************************//** +A simple sprintf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). +@return heap-allocated formatted string */ +char* +mem_heap_printf( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* format, /*!< in: format string */ + ...) MY_ATTRIBUTE ((format (printf, 2, 3))); + +#ifdef UNIV_DEBUG +/** Validates the contents of a memory heap. 
+Asserts that the memory heap is consistent
+@param[in] heap Memory heap to validate */
+void
+mem_heap_validate(
+	const mem_heap_t*	heap);
+
+#endif /* UNIV_DEBUG */
+
+/*#######################################################################*/
+
+/** The info structure stored at the beginning of a heap block.
+The same structure doubles as the heap handle (mem_heap_t): the fields
+base, total_size and free_block are meaningful only in the first block. */
+struct mem_block_info_t {
+#ifdef UNIV_DEBUG
+	char	file_name[8];/* file name where the mem heap was created
+			NOTE(review): only 8 bytes are reserved, so presumably
+			a shortened name is stored; confirm in the code that
+			fills this field */
+	unsigned line;	/*!< line number where the mem heap was created */
+#endif /* UNIV_DEBUG */
+	UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in
+			the list this is the base node of the list of blocks;
+			in subsequent blocks this is undefined */
+	UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+			and prev in the list. The first block allocated
+			to the heap is also the first block in this list,
+			though it also contains the base node of the list. */
+	ulint	len;	/*!< physical length of this block in bytes */
+	ulint	total_size; /*!< physical length in bytes of all blocks
+			in the heap. This is defined only in the base
+			node and is set to ULINT_UNDEFINED in others. 
*/
+	ulint	type;	/*!< type of heap: MEM_HEAP_DYNAMIC, or
+			MEM_HEAP_BUFFER possibly ORed to MEM_HEAP_BTR_SEARCH */
+	ulint	free;	/*!< offset in bytes of the first free position for
+			user data in the block */
+	ulint	start;	/*!< the value of the struct field 'free' at the
+			creation of the block */
+
+	void*	free_block;
+			/* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+			and this is the heap root, this can contain an
+			allocated buffer frame, which can be appended as a
+			free block to the heap, if we need more space;
+			otherwise, this is NULL */
+	void*	buf_block;
+			/* if this block has been allocated from the buffer
+			pool, this contains the buf_block_t handle;
+			otherwise, this is NULL */
+};
+
+/* Header size for a memory heap block: sizeof(mem_block_info_t) rounded up
+to a multiple of UNIV_MEM_ALIGNMENT */
+#define MEM_BLOCK_HEADER_SIZE UT_CALC_ALIGN(sizeof(mem_block_info_t),\
+	UNIV_MEM_ALIGNMENT)
+
+#include "mem0mem.ic"
+#endif
diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic
new file mode 100644
index 00000000..9236bbef
--- /dev/null
+++ b/storage/innobase/include/mem0mem.ic
@@ -0,0 +1,466 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.ic
+The memory management: inline implementations of the memory heap functions
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+/* Wrappers that take the creation site (file_name, line) in every build.
+The arguments are forwarded when UNIV_DEBUG is defined and dropped
+otherwise, so that callers need no conditional compilation. */
+#ifdef UNIV_DEBUG
+# define mem_heap_create_block(heap, n, type, file_name, line)		\
+	mem_heap_create_block_func(heap, n, file_name, line, type)
+# define mem_heap_create_at(N, file_name, line)				\
+	mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC)
+#else /* UNIV_DEBUG */
+# define mem_heap_create_block(heap, n, type, file_name, line)		\
+	mem_heap_create_block_func(heap, n, type)
+# define mem_heap_create_at(N, file_name, line)				\
+	mem_heap_create_func(N, MEM_HEAP_DYNAMIC)
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+NOTE: the parameter order differs between builds; use the
+mem_heap_create_block() macro, which hides the difference.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+	mem_heap_t*	heap,	/*!< in: memory heap or NULL if first block
+				should be created */
+	ulint		n,	/*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+	const char*	file_name,/*!< in: file name where created */
+	unsigned	line,	/*!< in: line where created */
+#endif /* UNIV_DEBUG */
+	ulint		type);	/*!< in: type of heap: MEM_HEAP_DYNAMIC or
+				MEM_HEAP_BUFFER */
+
+/******************************************************************//**
+Frees a block from a memory heap. 
*/
+void
+mem_heap_block_free(
+	mem_heap_t*	heap,	/*!< in: heap */
+	mem_block_t*	block);	/*!< in: block to free */
+
+/** Frees the free_block field from a memory heap.
+@param[in,out] heap heap */
+void
+mem_heap_free_block_free(
+	mem_heap_t*	heap);
+
+/** Adds a new block to a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes needed
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_add_block(
+	mem_heap_t*	heap,
+	ulint		n);
+
+/** Sets the length of a memory block.
+@param[in,out] block memory block
+@param[in] len length to store; must be nonzero */
+UNIV_INLINE
+void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+	ut_ad(len != 0);
+	block->len = len;
+}
+
+/** Gets the length of a memory block.
+@param[in] block memory block
+@return the stored length */
+UNIV_INLINE
+ulint
+mem_block_get_len(mem_block_t* block)
+{
+	return block->len;
+}
+
+/** Sets the heap type of a memory block.
+@param[in,out] block memory block
+@param[in] type MEM_HEAP_DYNAMIC, MEM_HEAP_BUFFER, or
+MEM_HEAP_BUFFER combined with MEM_HEAP_BTR_SEARCH */
+UNIV_INLINE
+void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+	ut_ad(type == MEM_HEAP_DYNAMIC
+	      || type == MEM_HEAP_BUFFER
+	      || type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH);
+	block->type = type;
+}
+
+/** Gets the heap type of a memory block.
+@param[in] block memory block
+@return the stored type */
+UNIV_INLINE
+ulint
+mem_block_get_type(mem_block_t* block)
+{
+	return block->type;
+}
+
+/** Sets the offset of the first free byte of a memory block.
+@param[in,out] block memory block
+@param[in] free offset to store; must be nonzero and must not exceed
+the length of the block */
+UNIV_INLINE
+void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+	ut_ad(free != 0);
+	ut_ad(mem_block_get_len(block) >= free);
+	block->free = free;
+}
+
+/** Gets the offset of the first free byte of a memory block.
+@param[in] block memory block
+@return the stored offset */
+UNIV_INLINE
+ulint
+mem_block_get_free(mem_block_t* block)
+{
+	return block->free;
+}
+
+/** Sets the start offset of a memory block.
+@param[in,out] block memory block
+@param[in] start offset to store; must be nonzero */
+UNIV_INLINE
+void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+	ut_ad(start != 0);
+	block->start = start;
+}
+
+/** Gets the start offset of a memory block.
+@param[in] block memory block
+@return the stored offset */
+UNIV_INLINE
+ulint
+mem_block_get_start(mem_block_t* block)
+{
+	return block->start;
+}
+
+/** Allocates and zero-fills n bytes of memory from a memory heap. 
+@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( + mem_heap_t* heap, + ulint n) +{ + ut_ad(heap); + ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH)); + return(memset(mem_heap_alloc(heap, n), 0, n)); +} + +/** Allocates n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( + mem_heap_t* heap, + ulint n) +{ + mem_block_t* block; + byte* buf; + ulint free; + + block = UT_LIST_GET_LAST(heap->base); + + n += REDZONE_SIZE; + + ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF)); + + /* Check if there is enough space in block. If not, create a new + block to the heap */ + + if (mem_block_get_len(block) + < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) { + + block = mem_heap_add_block(heap, n); + + if (block == NULL) { + + return(NULL); + } + } + + free = mem_block_get_free(block); + + buf = (byte*) block + free; + + mem_block_set_free(block, free + MEM_SPACE_NEEDED(n)); + + buf = buf + REDZONE_SIZE; + MEM_MAKE_ADDRESSABLE(buf, n - REDZONE_SIZE); + return(buf); +} + +/** Returns a pointer to the heap top. +@param[in] heap memory heap +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( + mem_heap_t* heap) +{ + mem_block_t* block; + byte* buf; + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block); + + return(buf); +} + +/** Frees the space in a memory heap exceeding the pointer given. +The pointer must have been acquired from mem_heap_get_heap_top. +The first memory block of the heap is not freed. 
+@param[in] heap heap from which to free
+@param[in] old_top pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+	mem_heap_t*	heap,
+	byte*		old_top)
+{
+	ut_d(mem_heap_validate(heap));
+
+	mem_block_t*	block = UT_LIST_GET_LAST(heap->base);
+
+	/* Walk backwards from the last block, freeing every block that
+	does not contain old_top. */
+	for (; block != NULL; ) {
+		byte*	first = reinterpret_cast<byte*>(block);
+
+		if (first <= old_top
+		    && old_top <= first + mem_block_get_free(block)) {
+			/* This is the block that old_top points into. */
+			break;
+		}
+
+		/* Freeing erases the block, so fetch its predecessor
+		before that. */
+		mem_block_t*	predecessor = UT_LIST_GET_PREV(list, block);
+
+		mem_heap_block_free(heap, block);
+
+		block = predecessor;
+	}
+
+	ut_ad(block);
+
+	/* Move the free offset of the block back to old_top. */
+	mem_block_set_free(block,
+			   ulint(old_top - reinterpret_cast<byte*>(block)));
+
+	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+	MEM_NOACCESS(old_top, (byte*) block + block->len - old_top);
+
+	/* A block that became empty is released, unless it is the first
+	block of the heap. */
+	const bool	is_empty = mem_block_get_free(block)
+		== mem_block_get_start(block);
+
+	if (heap != block && is_empty) {
+		mem_heap_block_free(heap, block);
+	}
+}
+
+/** Empties a memory heap by rewinding it to the start offset of its first
+block and releasing heap->free_block, if there is one.
+The first memory block of the heap is not freed.
+@param[in] heap heap to empty */
+UNIV_INLINE
+void
+mem_heap_empty(
+	mem_heap_t*	heap)
+{
+	byte*	bottom = reinterpret_cast<byte*>(heap)
+		+ mem_block_get_start(heap);
+
+	mem_heap_free_heap_top(heap, bottom);
+
+	if (heap->free_block) {
+		mem_heap_free_block_free(heap);
+	}
+}
+
+/** Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given. 
+@param[in] heap memory heap
+@param[in] n size of the topmost element
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+	mem_heap_t*	heap,
+	ulint		n)
+{
+	mem_block_t*	last = UT_LIST_GET_LAST(heap->base);
+
+	/* The element ends at the free offset of the last block. */
+	byte*	top = reinterpret_cast<byte*>(last)
+		+ mem_block_get_free(last) - MEM_SPACE_NEEDED(n);
+
+	return top;
+}
+
+/** Frees the topmost element in a memory heap.
+The size of the element must be given. A block that becomes empty is
+released, unless it is the first block of the heap.
+@param[in] heap memory heap
+@param[in] n size of the topmost element */
+UNIV_INLINE
+void
+mem_heap_free_top(
+	mem_heap_t*	heap,
+	ulint		n)
+{
+	mem_block_t*	last = UT_LIST_GET_LAST(heap->base);
+
+	/* The allocation reserved REDZONE_SIZE extra bytes. */
+	n += REDZONE_SIZE;
+
+	mem_block_set_free(last,
+			   mem_block_get_free(last) - MEM_SPACE_NEEDED(n));
+
+	const bool	releasable = heap != last
+		&& mem_block_get_free(last) == mem_block_get_start(last);
+
+	if (!releasable) {
+		MEM_NOACCESS((byte*) last + mem_block_get_free(last), n);
+		return;
+	}
+
+	mem_heap_block_free(heap, last);
+}
+
+/** Creates a memory heap.
+NOTE: Use the corresponding macros instead of this function.
+A single user buffer of 'size' will fit in the block.
+0 creates a default size block.
+@param[in] size Desired start block size. 
+@param[in] file_name File name where created +@param[in] line Line where created +@param[in] type Heap type +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( + ulint size, +#ifdef UNIV_DEBUG + const char* file_name, + unsigned line, +#endif /* UNIV_DEBUG */ + ulint type) +{ + mem_block_t* block; + + if (!size) { + size = MEM_BLOCK_START_SIZE; + } + + block = mem_heap_create_block(NULL, size, type, file_name, line); + + if (block == NULL) { + + return(NULL); + } + + /* The first block should not be in buffer pool, + because it might be relocated to resize buffer pool. */ + ut_ad(block->buf_block == NULL); + + UT_LIST_INIT(block->base, &mem_block_t::list); + + /* Add the created block itself as the first block in the list */ + UT_LIST_ADD_FIRST(block->base, block); + + return(block); +} + +/** Frees the space occupied by a memory heap. +NOTE: Use the corresponding macro instead of this function. +@param[in] heap Heap to be freed */ +UNIV_INLINE +void +mem_heap_free( + mem_heap_t* heap) +{ + mem_block_t* block; + mem_block_t* prev_block; + + block = UT_LIST_GET_LAST(heap->base); + + if (heap->free_block) { + mem_heap_free_block_free(heap); + } + + while (block != NULL) { + /* Store the contents of info before freeing current block + (it is erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } +} + +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint size = heap->total_size; + + if (heap->free_block) { + size += srv_page_size; + } + + return(size); +} + +/**********************************************************************//** +Duplicates a NUL-terminated string. 
+@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str) /*!< in: string to be copied */ +{ + ulint len = strlen(str) + 1; + return(static_cast<char*>(memcpy(ut_malloc_nokey(len), str, len))); +} + +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len) /*!< in: length of str, in bytes */ +{ + char* s = static_cast<char*>(ut_malloc_nokey(len + 1)); + s[len] = 0; + return(static_cast<char*>(memcpy(s, str, len))); +} diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h new file mode 100644 index 00000000..0d83d83b --- /dev/null +++ b/storage/innobase/include/mtr0log.h @@ -0,0 +1,673 @@ +/***************************************************************************** + +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** +@file include/mtr0log.h +Mini-transaction log record encoding and decoding +*******************************************************/ + +#pragma once +#include "mtr0mtr.h" + +/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_2BYTE= 1 << 7; +/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14); +/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21); +/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28); + +/** Error from mlog_decode_varint() */ +constexpr uint32_t MLOG_DECODE_ERROR= ~0U; + +/** Decode the length of a variable-length encoded integer. +@param first first byte of the encoded integer +@return the length, in bytes */ +inline uint8_t mlog_decode_varint_length(byte first) +{ + uint8_t len= 1; + for (; first & 0x80; len++, first= static_cast<uint8_t>(first << 1)); + return len; +} + +/** Decode an integer in a redo log record. 
+@param log redo log record buffer; no bounds are checked here, so the
+caller has to make sure that the whole encoded integer is readable
+@return the decoded integer
+@retval MLOG_DECODE_ERROR on error */
+inline uint32_t mlog_decode_varint(const byte* log)
+{
+  uint32_t i= *log;
+  if (i < MIN_2BYTE) /* 1 byte: the value itself */
+    return i;
+  if (i < 0xc0) /* 2 bytes */
+    return MIN_2BYTE + ((i & ~0x80) << 8 | log[1]);
+  if (i < 0xe0) /* 3 bytes */
+    return MIN_3BYTE + ((i & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2]);
+  if (i < 0xf0) /* 4 bytes */
+    return MIN_4BYTE + ((i & ~0xe0) << 24 | uint32_t{log[1]} << 16 |
+                        uint32_t{log[2]} << 8 | log[3]);
+  if (i == 0xf0) /* 5 bytes; any other first byte is an error */
+  {
+    i= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+      uint32_t{log[3]} << 8 | log[4];
+    if (i <= ~MIN_5BYTE) /* MIN_5BYTE + i must not wrap around */
+      return MIN_5BYTE + i;
+  }
+  return MLOG_DECODE_ERROR;
+}
+
+/** Encode an integer in a redo log record.
+The first byte carries the length prefix and the most significant bits;
+the branches below share the code that writes the less significant bytes
+by jumping to the labels last3 and last2.
+@param log redo log record buffer
+@param i the integer to encode
+@return end of the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+  if (i < MIN_2BYTE)
+  { /* 1 byte: only the final store after the #endif is needed */
+  }
+  else if (i < MIN_3BYTE)
+  { /* 2 bytes: prefix byte here, last byte after the #endif */
+    i-= MIN_2BYTE;
+    static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+    *log++= 0x80 | static_cast<byte>(i >> 8);
+  }
+  else if (i < MIN_4BYTE)
+  { /* 3 bytes */
+    i-= MIN_3BYTE;
+    static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+    *log++= 0xc0 | static_cast<byte>(i >> 16);
+    goto last2;
+  }
+  else if (i < MIN_5BYTE)
+  { /* 4 bytes */
+    i-= MIN_4BYTE;
+    static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+    *log++= 0xe0 | static_cast<byte>(i >> 24);
+    goto last3;
+  }
+  else
+  { /* 5 bytes: the prefix byte carries no value bits */
+    ut_ad(i < MLOG_DECODE_ERROR);
+    i-= MIN_5BYTE;
+    *log++= 0xf0;
+    *log++= static_cast<byte>(i >> 24);
+last3:
+    *log++= static_cast<byte>(i >> 16);
+last2:
+    *log++= static_cast<byte>(i >> 8);
+  }
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+  *log++= static_cast<byte>(i); /* least significant byte, always written */
+  return log;
+}
+
+/** Determine the length 
of a log record. +@param log start of log record +@param end end of the log record buffer +@return the length of the record, in bytes +@retval 0 if the log extends past the end +@retval MLOG_DECODE_ERROR if the record is corrupted */ +inline uint32_t mlog_decode_len(const byte *log, const byte *end) +{ + ut_ad(log < end); + uint32_t i= *log; + if (!i) + return 0; /* end of mini-transaction */ + if (~i & 15) + return (i & 15) + 1; /* 1..16 bytes */ + if (UNIV_UNLIKELY(++log == end)) + return 0; /* end of buffer */ + i= *log; + if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */ + return 16 + i; + if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */ + { + if (UNIV_UNLIKELY(log + 1 == end)) + return 0; /* end of buffer */ + return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]); + } + if (i < 0xe0) /* 3 additional length bytes: 16528..1065103 bytes */ + { + if (UNIV_UNLIKELY(log + 2 == end)) + return 0; /* end of buffer */ + return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 | + static_cast<uint32_t>(log[1]) << 8 | log[2]); + } + /* 1,065,103 bytes per log record ought to be enough for everyone */ + return MLOG_DECODE_ERROR; +} + +/** Write 1, 2, 4, or 8 bytes to a file page. 
+@param[in] block file page
+@param[in,out] ptr pointer in file page
+@param[in] val value to write
+@tparam l number of bytes to write
+@tparam w write request type
+@tparam V type of val
+@return whether any log was written */
+template<unsigned l,mtr_t::write_type w,typename V>
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
+{
+  ut_ad(ut_align_down(ptr, srv_page_size) == block.frame);
+  static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+  byte buf[l];
+/* Serialize val into buf. For l < 8, assert that val fits in l bytes. */
+  switch (l) {
+  case 1:
+    ut_ad(val == static_cast<byte>(val));
+    buf[0]= static_cast<byte>(val);
+    break;
+  case 2:
+    ut_ad(val == static_cast<uint16_t>(val));
+    mach_write_to_2(buf, static_cast<uint16_t>(val));
+    break;
+  case 4:
+    ut_ad(val == static_cast<uint32_t>(val));
+    mach_write_to_4(buf, static_cast<uint32_t>(val));
+    break;
+  case 8:
+    mach_write_to_8(buf, val);
+    break;
+  }
+  byte *p= static_cast<byte*>(ptr);
+  const byte *const end= p + l;
+  if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+  { /* skip the leading bytes that the page already contains */
+    const byte *b= buf;
+    while (*p++ == *b++)
+    {
+      if (p == end)
+      { /* all l bytes are unchanged: nothing is written or logged */
+        ut_ad(w == MAYBE_NOP);
+        return false;
+      }
+    }
+    p--; /* step back to the first byte that differs */
+  }
+  ::memcpy(ptr, buf, l); /* update the page */
+  /* log the bytes from p (the first difference, if skipped) to the end */
+  memcpy_low(block, static_cast<uint16_t>
+             (ut_align_offset(p, srv_page_size)), p, end - p);
+  return true;
+}
+
+/** Log an initialization of a string of bytes.
+Only the log record is written here; the page itself is not modified.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(len);
+  set_modified(b);
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+/* lenlen: bytes of the encoded len, plus 1 byte for val */
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 
2 + 1 : 3 + 1);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen, true, ofs);
+  l= mlog_encode_varint(l, len);
+  *l++= val;
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a string of bytes: fill the page and write the log record.
+@param[in,out] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ::memset(ofs + b->frame, val, len);
+  memset(*b, ofs, len, val);
+}
+
+/** Log an initialization of a repeating string of bytes.
+Only the log record is written here; the page itself is not modified.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write, in bytes
+@param[in] str the string to write
+@param[in] size size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(size);
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  set_modified(b);
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+/* lenlen: bytes of the encoded len; the size bytes of str follow it */
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size, true, ofs);
+  l= mlog_encode_varint(l, len);
+  ::memcpy(l, str, size);
+  l+= size;
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a repeating string of bytes. 
+@param[in,out] b buffer page +@param[in] ofs byte offset from b->frame +@param[in] len length of the data to write, in bytes +@param[in] str the string to write +@param[in] size size of str, in bytes */ +inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len, + const void *str, size_t size) +{ + ut_ad(ofs <= ulint(srv_page_size)); + ut_ad(ofs + len <= ulint(srv_page_size)); + ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */ + size_t s= 0; + while (s < len) + { + ::memcpy(ofs + s + b->frame, str, size); + s+= len; + } + ::memcpy(ofs + s + b->frame, str, len - s); + memset(*b, ofs, len, str, size); +} + +/** Log a write of a byte string to a page. +@param[in] b buffer page +@param[in] offset byte offset from b->frame +@param[in] str the data to write +@param[in] len length of the data to write */ +inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len) +{ + ut_ad(len); + ut_ad(offset <= ulint(srv_page_size)); + ut_ad(offset + len <= ulint(srv_page_size)); + memcpy_low(b, uint16_t(offset), &b.frame[offset], len); +} + +/** Log a write of a byte string to a page. +@param block page +@param offset byte offset within page +@param data data to be written +@param len length of the data, in bytes */ +inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset, + const void *data, size_t len) +{ + ut_ad(len); + set_modified(block); + if (m_log_mode != MTR_LOG_ALL) + return; + if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5)) + { + byte *end= log_write<WRITE>(block.page.id(), &block.page, len, true, + offset); + ::memcpy(end, data, len); + m_log.close(end + len); + } + else + { + m_log.close(log_write<WRITE>(block.page.id(), &block.page, len, false, + offset)); + m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len)); + } + m_last_offset= static_cast<uint16_t>(offset + len); +} + +/** Log that a string of bytes was copied from the same page. 
+@param[in] b buffer page
+@param[in] d destination offset within the page
+@param[in] s source offset within the page
+@param[in] len length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
+{
+  ut_ad(d >= 8);
+  ut_ad(s >= 8);
+  ut_ad(len);
+  ut_ad(s <= ulint(srv_page_size));
+  ut_ad(s + len <= ulint(srv_page_size));
+  ut_ad(s != d);
+  ut_ad(d <= ulint(srv_page_size));
+  ut_ad(d + len <= ulint(srv_page_size));
+
+  set_modified(b);
+  if (m_log_mode == MTR_LOG_ALL)
+  {
+    static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+    const size_t len_bytes= len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3;
+    /* The source is logged as its distance from the destination, with
+    the direction in the least significant bit: 0 when the source is
+    after the destination, 1 otherwise. */
+    const bool source_after= s > d;
+    s= source_after ? (s - d) << 1 : (d - s) << 1 | 1;
+    /* A distance of 0 cannot occur, so the encoding starts at 1. */
+    s-= 1 << 1;
+    const size_t source_bytes= s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 2 : 3;
+    byte *ptr= log_write<MEMMOVE>(b.page.id(), &b.page,
+                                  len_bytes + source_bytes, true, d);
+    ptr= mlog_encode_varint(ptr, len);
+    ptr= mlog_encode_varint(ptr, s);
+    m_log.close(ptr);
+    m_last_offset= static_cast<uint16_t>(d + len);
+  }
+}
+
+/**
+Write a log record. 
+@tparam type redo log record type
+@param id persistent page identifier
+@param bpage buffer pool page, or nullptr
+@param len number of additional bytes to write
+@param alloc whether to allocate the additional bytes
+@param offset byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+                              size_t len, bool alloc, size_t offset)
+{
+  static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
+                type <= FILE_CHECKPOINT, "invalid type");
+  ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+  ut_ad(!bpage || bpage->id() == id);
+  constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+  constexpr bool have_offset= type == WRITE || type == MEMSET ||
+    type == MEMMOVE;
+  static_assert(!have_offset || have_len, "consistency");
+  ut_ad(have_len || len == 0);
+  ut_ad(have_len || !alloc);
+  ut_ad(have_offset || offset == 0);
+  ut_ad(offset + len <= srv_page_size);
+  static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+/* max_len: upper bound for the size of the record header. The two cases
+for the same page as the previous record (m_last) are the only ones that
+are smaller than 1 + 5 + 5; same_page below is derived from that. */
+  size_t max_len;
+  if (!have_len)
+    max_len= 1 + 5 + 5;
+  else if (!have_offset)
+    max_len= bpage && m_last == bpage
+      ? 1 + 3
+      : 1 + 3 + 5 + 5;
+  else if (bpage && m_last == bpage && m_last_offset <= offset)
+  {
+    /* Encode the offset relative from m_last_offset. */
+    offset-= m_last_offset;
+    max_len= 1 + 3 + 3;
+  }
+  else
+    max_len= 1 + 3 + 5 + 5 + 3;
+  byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+  byte *end= log_ptr + 1; /* the first byte is filled in last */
+  const byte same_page= max_len < 1 + 5 + 5 ? 
0x80 : 0;
+  if (!same_page)
+  { /* the page identifier is written only when the page changes */
+    end= mlog_encode_varint(end, id.space());
+    end= mlog_encode_varint(end, id.page_no());
+    m_last= bpage;
+  }
+  if (have_offset)
+  {
+    byte* oend= mlog_encode_varint(end, offset);
+    if (oend + len > &log_ptr[16])
+    { /* too long for the 4 length bits of the first byte */
+      len+= oend - log_ptr - 15;
+      if (len >= MIN_3BYTE - 1)
+        len+= 2;
+      else if (len >= MIN_2BYTE)
+        len++;
+/* rewrite the header from the start, with an explicit length after byte 0 */
+      *log_ptr= type | same_page;
+      end= mlog_encode_varint(log_ptr + 1, len);
+      if (!same_page)
+      {
+        end= mlog_encode_varint(end, id.space());
+        end= mlog_encode_varint(end, id.page_no());
+      }
+      end= mlog_encode_varint(end, offset);
+      return end;
+    }
+    else
+      end= oend;
+  }
+  else if (len >= 3 && end + len > &log_ptr[16])
+  { /* same long-record handling, for record types without an offset */
+    len+= end - log_ptr - 15;
+    if (len >= MIN_3BYTE - 1)
+      len+= 2;
+    else if (len >= MIN_2BYTE)
+      len++;
+/* rewrite the header from the start, with an explicit length after byte 0 */
+    end= log_ptr;
+    *end++= type | same_page;
+    end= mlog_encode_varint(end, len);
+/* the page identifier follows the length */
+    if (!same_page)
+    {
+      end= mlog_encode_varint(end, id.space());
+      end= mlog_encode_varint(end, id.page_no());
+    }
+    return end;
+  }
+/* short record: the length after byte 0 goes into the low 4 bits of byte 0 */
+  ut_ad(end + len >= &log_ptr[1] + !same_page);
+  ut_ad(end + len <= &log_ptr[16]);
+  ut_ad(end <= &log_ptr[max_len]);
+  *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+  ut_ad(*log_ptr & 15);
+  return end;
+}
+
+/** Write a byte string to a page. 
+@param[in] b buffer page +@param[in] dest destination within b.frame +@param[in] str the data to write +@param[in] len length of the data to write +@tparam w write request type */ +template<mtr_t::write_type w> +inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str, + ulint len) +{ + ut_ad(ut_align_down(dest, srv_page_size) == b.frame); + char *d= static_cast<char*>(dest); + const char *s= static_cast<const char*>(str); + if (w != FORCED && m_log_mode == MTR_LOG_ALL) + { + ut_ad(len); + const char *const end= d + len; + while (*d++ == *s++) + { + if (d == end) + { + ut_ad(w == MAYBE_NOP); + return; + } + } + s--; + d--; + len= static_cast<ulint>(end - d); + } + ::memcpy(d, s, len); + memcpy(b, ut_align_offset(d, srv_page_size), len); +} + +/** Initialize an entire page. +@param[in,out] b buffer page */ +inline void mtr_t::init(buf_block_t *b) +{ + const page_id_t id{b->page.id()}; + ut_ad(is_named_space(id.space())); + ut_ad(!m_freed_pages == !m_freed_space); + + if (UNIV_LIKELY_NULL(m_freed_space) && + m_freed_space->id == id.space() && + m_freed_pages->remove_if_exists(b->page.id().page_no()) && + m_freed_pages->empty()) + { + delete m_freed_pages; + m_freed_pages= nullptr; + m_freed_space= nullptr; + } + + b->page.status= buf_page_t::INIT_ON_FLUSH; + + if (m_log_mode != MTR_LOG_ALL) + { + ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO); + return; + } + + m_log.close(log_write<INIT_PAGE>(b->page.id(), &b->page)); + m_last_offset= FIL_PAGE_TYPE; +} + +/** Free a page. +@param[in] space tablespace contains page to be freed +@param[in] offset page offset to be freed */ +inline void mtr_t::free(fil_space_t &space, uint32_t offset) +{ + ut_ad(is_named_space(&space)); + ut_ad(!m_freed_space || m_freed_space == &space); + + if (m_log_mode == MTR_LOG_ALL) + m_log.close(log_write<FREE_PAGE>({space.id, offset}, nullptr)); +} + +/** Write an EXTENDED log record. 
+@param block buffer pool page +@param type extended record subtype; @see mrec_ext_t */ +inline void mtr_t::log_write_extended(const buf_block_t &block, byte type) +{ + set_modified(block); + if (m_log_mode != MTR_LOG_ALL) + return; + /* The payload of the EXTENDED record is the single subtype byte. */ + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true); + *l++= type; + m_log.close(l); + m_last_offset= FIL_PAGE_TYPE; +} + +/** Write log for partly initializing a B-tree or R-tree page. +@param block B-tree or R-tree page +@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */ +inline void mtr_t::page_create(const buf_block_t &block, bool comp) +{ + /* comp is passed as the subtype byte, so false and true must be equal + to the two subtype constants. */ + static_assert(false == INIT_ROW_FORMAT_REDUNDANT, "encoding"); + static_assert(true == INIT_ROW_FORMAT_DYNAMIC, "encoding"); + log_write_extended(block, comp); +} + +/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT. +@param block B-tree or R-tree page +@param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_OLD_INFIMUM */ +inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec) +{ + ut_ad(!block.zip_size()); + ut_ad(prev_rec < block.physical_size()); + set_modified(block); + if (m_log_mode != MTR_LOG_ALL) + return; + /* 1 byte for the subtype, plus the bytes reserved for prev_rec as + written by mlog_encode_varint(); checked by the end == l assertion. */ + size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true); + ut_d(byte *end= l + len); + *l++= DELETE_ROW_FORMAT_REDUNDANT; + l= mlog_encode_varint(l, prev_rec); + ut_ad(end == l); + m_log.close(l); + m_last_offset= FIL_PAGE_TYPE; +} + +/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record. 
+@param block B-tree or R-tree page +@param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_NEW_INFIMUM +@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES +@param data_size data payload size, in bytes */ +inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec, + size_t hdr_size, size_t data_size) +{ + ut_ad(!block.zip_size()); + set_modified(block); + ut_ad(hdr_size < MIN_3BYTE); + ut_ad(prev_rec < block.physical_size()); + ut_ad(data_size < block.physical_size()); + if (m_log_mode != MTR_LOG_ALL) + return; + /* 1 byte for the subtype, plus the bytes reserved for the three + values written with mlog_encode_varint() below; checked by the + end == l assertion. */ + size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4; + len+= hdr_size < MIN_2BYTE ? 1 : 2; + len+= data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 2 : 3; + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true); + ut_d(byte *end= l + len); + *l++= DELETE_ROW_FORMAT_DYNAMIC; + l= mlog_encode_varint(l, prev_rec); + l= mlog_encode_varint(l, hdr_size); + l= mlog_encode_varint(l, data_size); + ut_ad(end == l); + m_log.close(l); + m_last_offset= FIL_PAGE_TYPE; +} + +/** Write log for initializing an undo log page. +@param block undo page */ +inline void mtr_t::undo_create(const buf_block_t &block) +{ + log_write_extended(block, UNDO_INIT); +} + +/** Write log for appending an undo log record. 
+@param block undo page +@param data record within the undo page +@param len length of the undo record, in bytes */ +inline void mtr_t::undo_append(const buf_block_t &block, + const void *data, size_t len) +{ + ut_ad(len > 2); + set_modified(block); + if (m_log_mode != MTR_LOG_ALL) + return; + const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5); + byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small); + if (UNIV_LIKELY(small)) + { + *end++= UNDO_APPEND; + ::memcpy(end, data, len); + m_log.close(end + len); + } + else + { + m_log.close(end); + *m_log.push<byte*>(1)= UNDO_APPEND; + m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len)); + } + m_last_offset= FIL_PAGE_TYPE; +} + +/** Trim the end of a tablespace. +@param id first page identifier that will not be in the file */ +inline void mtr_t::trim_pages(const page_id_t id) +{ + if (m_log_mode != MTR_LOG_ALL) + return; + byte *l= log_write<EXTENDED>(id, nullptr, 1, true); + *l++= TRIM_PAGES; + m_log.close(l); + set_trim_pages(); +} diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h new file mode 100644 index 00000000..f3db0008 --- /dev/null +++ b/storage/innobase/include/mtr0mtr.h @@ -0,0 +1,696 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0mtr.h +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0mtr_h +#define mtr0mtr_h + +#include "fil0fil.h" +#include "dyn0buf.h" + +/** Start a mini-transaction. */ +#define mtr_start(m) (m)->start() + +/** Commit a mini-transaction. */ +#define mtr_commit(m) (m)->commit() + +/** Set and return a savepoint in mtr. +@return savepoint */ +#define mtr_set_savepoint(m) (m)->get_savepoint() + +/** Release the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +#define mtr_release_s_latch_at_savepoint(m, s, l) \ + (m)->release_s_latch_at_savepoint((s), (l)) + +/** Change the logging mode of a mini-transaction. +@return old mode */ +#define mtr_set_log_mode(m, d) (m)->set_log_mode((d)) + +/** Release an object in the memo stack. +@return true if released */ +#define mtr_memo_release(m, o, t) \ + (m)->memo_release((o), (t)) + +/** Print info of an mtr handle. */ +#define mtr_print(m) (m)->print() + +/** Return the log object of a mini-transaction buffer. +@return log */ +#define mtr_get_log(m) (m)->get_log() + +/** Push an object to an mtr memo stack. 
*/ +#define mtr_memo_push(m, o, t) (m)->memo_push(o, t) + +/** Acquire a tablespace X-latch or SX-latch in mini-transaction m, +passing the call site as file and line. */ +#define mtr_x_lock_space(s, m) (m)->x_lock_space((s), __FILE__, __LINE__) +#define mtr_sx_lock_space(s, m) (m)->sx_lock_space((s), __FILE__, __LINE__) + +/** Acquire the latch (i)->lock in S, X or SX mode in mini-transaction m, +passing the call site as file and line. */ +#define mtr_s_lock_index(i, m) (m)->s_lock(&(i)->lock, __FILE__, __LINE__) +#define mtr_x_lock_index(i, m) (m)->x_lock(&(i)->lock, __FILE__, __LINE__) +#define mtr_sx_lock_index(i, m) (m)->sx_lock(&(i)->lock, __FILE__, __LINE__) + +/** Release the block in an mtr memo after a savepoint. */ +#define mtr_release_block_at_savepoint(m, s, b) \ + (m)->release_block_at_savepoint((s), (b)) + +/** SX-latch a not yet latched block after a savepoint. */ +#define mtr_block_sx_latch_at_savepoint(m, s, b) \ + (m)->sx_latch_at_savepoint((s), (b)) + +/** X-latch a not yet latched block after a savepoint. */ +#define mtr_block_x_latch_at_savepoint(m, s, b) \ + (m)->x_latch_at_savepoint((s), (b)) + +/** Mini-transaction memo stack slot. */ +struct mtr_memo_slot_t { + /** pointer to the object */ + void* object; + + /** type of the stored object */ + mtr_memo_type_t type; +}; + +/** Mini-transaction handle and buffer */ +struct mtr_t { + /** Start a mini-transaction. */ + void start(); + + /** Commit the mini-transaction. */ + void commit(); + + /** Commit a mini-transaction that did not modify any pages, + but generated some redo log on a higher level, such as + FILE_MODIFY records and an optional FILE_CHECKPOINT marker. + The caller must hold log_sys.mutex. + This is to be used at log_checkpoint(). + @param checkpoint_lsn the log sequence number of a checkpoint, or 0 */ + void commit_files(lsn_t checkpoint_lsn= 0); + + /** @return mini-transaction savepoint (current size of m_memo) */ + ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); } + + /** Release the (index tree) s-latch stored in an mtr memo after a + savepoint. + @param savepoint value returned by get_savepoint() + @param lock latch to release */ + inline void release_s_latch_at_savepoint( + ulint savepoint, + rw_lock_t* lock); + + /** Release the block in an mtr memo after a savepoint. 
*/ + inline void release_block_at_savepoint( + ulint savepoint, + buf_block_t* block); + + /** SX-latch a not yet latched block after a savepoint. */ + inline void sx_latch_at_savepoint(ulint savepoint, buf_block_t* block); + + /** X-latch a not yet latched block after a savepoint. */ + inline void x_latch_at_savepoint(ulint savepoint, buf_block_t* block); + + /** @return the logging mode */ + mtr_log_t get_log_mode() const + { + static_assert(MTR_LOG_ALL == 0, "efficiency"); + ut_ad(m_log_mode <= MTR_LOG_NO_REDO); + return static_cast<mtr_log_t>(m_log_mode); + } + + /** Change the logging mode. + @param mode logging mode + @return old mode */ + mtr_log_t set_log_mode(mtr_log_t mode) + { + const mtr_log_t old_mode= get_log_mode(); + m_log_mode= mode & 3; + return old_mode; + } + + /** Check if we are holding a block latch in exclusive mode + @param block buffer pool block to search for */ + bool have_x_latch(const buf_block_t &block) const; + + /** Copy the tablespaces associated with the mini-transaction + (needed for generating FILE_MODIFY records) + @param[in] mtr mini-transaction that may modify + the same set of tablespaces as this one */ + void set_spaces(const mtr_t& mtr) + { + ut_ad(!m_user_space_id); + ut_ad(!m_user_space); + + ut_d(m_user_space_id = mtr.m_user_space_id); + m_user_space = mtr.m_user_space; + } + + /** Set the tablespace associated with the mini-transaction + (needed for generating a FILE_MODIFY record) + @param[in] space_id user or system tablespace ID + @return the tablespace */ + fil_space_t* set_named_space_id(ulint space_id) + { + ut_ad(!m_user_space_id); + ut_d(m_user_space_id = static_cast<uint32_t>(space_id)); + if (!space_id) { + return fil_system.sys_space; + } else { + ut_ad(m_user_space_id == space_id); + ut_ad(!m_user_space); + m_user_space = fil_space_get(space_id); + ut_ad(m_user_space); + return m_user_space; + } + } + + /** Set the tablespace associated with the mini-transaction + (needed for generating a FILE_MODIFY 
record) + @param[in] space user or system tablespace */ + void set_named_space(fil_space_t* space) + { + ut_ad(!m_user_space_id); + ut_d(m_user_space_id = static_cast<uint32_t>(space->id)); + if (space->id) { + m_user_space = space; + } + } + +#ifdef UNIV_DEBUG + /** Check the tablespace associated with the mini-transaction + (needed for generating a FILE_MODIFY record) + @param[in] space tablespace + @return whether the mini-transaction is associated with the space */ + bool is_named_space(ulint space) const; + /** Check the tablespace associated with the mini-transaction + (needed for generating a FILE_MODIFY record) + @param[in] space tablespace + @return whether the mini-transaction is associated with the space */ + bool is_named_space(const fil_space_t* space) const; +#endif /* UNIV_DEBUG */ + + /** Acquire a tablespace X-latch. + @param[in] space_id tablespace ID + @param[in] file file name from where called + @param[in] line line number in file + @return the tablespace object (never NULL) */ + fil_space_t* x_lock_space( + ulint space_id, + const char* file, + unsigned line); + + /** Acquire a shared rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void s_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_s_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_S_LOCK); + } + + /** Acquire an exclusive rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void x_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_x_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_X_LOCK); + } + + /** Acquire an shared/exclusive rw-latch. 
+ @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void sx_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_sx_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_SX_LOCK); + } + + /** Acquire a tablespace X-latch. + The latch is registered in the memo as MTR_MEMO_SPACE_X_LOCK. + @param[in] space tablespace + @param[in] file file name from where called + @param[in] line line number in file */ + void x_lock_space(fil_space_t* space, const char* file, unsigned line) + { + ut_ad(space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT + || space->purpose == FIL_TYPE_TABLESPACE); + /* The memo entry is pushed before the latch is acquired, while + s_lock(), x_lock() and sx_lock() acquire first and push afterwards. + NOTE(review): confirm that the different order is intentional. */ + memo_push(space, MTR_MEMO_SPACE_X_LOCK); + rw_lock_x_lock_inline(&space->latch, 0, file, line); + } + + /** Acquire a tablespace SX-latch. + The latch is registered in the memo by sx_lock(), as MTR_MEMO_SX_LOCK. + @param[in] space tablespace + @param[in] file file name from where called + @param[in] line line number in file */ + void sx_lock_space(fil_space_t *space, const char *file, unsigned line) + { + ut_ad(space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT + || space->purpose == FIL_TYPE_TABLESPACE); + sx_lock(&space->latch, file, line); + } + + /** Release an object in the memo stack. + @param object object to release + @param type object type + @return true if released */ + bool memo_release(const void* object, ulint type); + /** Release a page latch. + @param[in] ptr pointer to within a page frame + @param[in] type object type: MTR_MEMO_PAGE_X_FIX, ... */ + void release_page(const void* ptr, mtr_memo_type_t type); + +private: + /** Note that the mini-transaction will modify data. */ + void flag_modified() { m_modifications = true; } + /** Mark the given latched page as modified. + @param block page that will be modified */ + void modify(const buf_block_t& block); +public: + /** Note that the mini-transaction will modify a block. + Unless the log mode is MTR_LOG_NONE, modify() is also invoked on it. + @param block page that will be modified */ + void set_modified(const buf_block_t &block) + { flag_modified(); if (m_log_mode != MTR_LOG_NONE) modify(block); } + + /** Set the state to not-modified. 
This will not log the changes. + This is only used during redo log apply, to avoid logging the changes. */ + void discard_modifications() { m_modifications = false; } + + /** Get the LSN of commit(). + @return the commit LSN + @retval 0 if the transaction only modified temporary tablespaces */ + lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; } + + /** Note that we are inside the change buffer code. */ + void enter_ibuf() { m_inside_ibuf= true; } + + /** Note that we have exited from the change buffer code. */ + void exit_ibuf() { m_inside_ibuf= false; } + + /** @return true if we are inside the change buffer code */ + bool is_inside_ibuf() const { return m_inside_ibuf; } + + /** Note that pages have been trimmed */ + void set_trim_pages() { m_trim_pages= true; } + + /** @return whether pages have been trimmed */ + bool is_trim_pages() { return m_trim_pages; } + +#ifdef UNIV_DEBUG + /** Check if we are holding an rw-latch in this mini-transaction + @param lock latch to search for + @param type held latch type + @return whether (lock,type) is contained */ + bool memo_contains(const rw_lock_t &lock, mtr_memo_type_t type) + MY_ATTRIBUTE((warn_unused_result)); + /** Check if we are holding exclusive tablespace latch + @param space tablespace to search for + @return whether space.latch is being held */ + bool memo_contains(const fil_space_t& space) + MY_ATTRIBUTE((warn_unused_result)); + + + /** Check if memo contains the given item. + @param ptr object to search for + @param flags specify types of object (can be ORred) of + MTR_MEMO_PAGE_S_FIX ... values + @return whether the memo contains the item */ + bool memo_contains_flagged(const void* ptr, ulint flags) const; + + /** Check if memo contains the given page. + @param[in] ptr pointer to within buffer frame + @param[in] flags specify types of object with OR of + MTR_MEMO_PAGE_S_FIX... 
values + @return the block + @retval NULL if not found */ + buf_block_t* memo_contains_page_flagged( + const byte* ptr, + ulint flags) const; + + /** Print info of an mtr handle. */ + void print() const; + + /** @return true if mini-transaction contains modifications. */ + bool has_modifications() const { return m_modifications; } + + /** @return the memo stack */ + const mtr_buf_t* get_memo() const { return &m_memo; } + + /** @return the memo stack */ + mtr_buf_t* get_memo() { return &m_memo; } +#endif /* UNIV_DEBUG */ + + /** @return true if a record was added to the mini-transaction */ + bool is_dirty() const { return m_made_dirty; } + + /** Get the buffered redo log of this mini-transaction. + @return redo log */ + const mtr_buf_t* get_log() const { return &m_log; } + + /** Get the buffered redo log of this mini-transaction. + @return redo log */ + mtr_buf_t* get_log() { return &m_log; } + + /** Push an object to an mtr memo stack. + @param object object + @param type object type: MTR_MEMO_S_LOCK, ... */ + inline void memo_push(void* object, mtr_memo_type_t type); + + /** Check if this mini-transaction is dirtying a clean page. + @param block block being x-fixed + @return true if the mtr is dirtying a clean page. */ + static inline bool is_block_dirtied(const buf_block_t* block) + MY_ATTRIBUTE((warn_unused_result)); + + /** Write request types */ + enum write_type + { + /** the page is guaranteed to always change */ + NORMAL= 0, + /** optional: the page contents might not change */ + MAYBE_NOP, + /** force a write, even if the page contents is not changing */ + FORCED + }; + + /** Write 1, 2, 4, or 8 bytes to a file page. 
+ @param[in] block file page + @param[in,out] ptr pointer in file page + @param[in] val value to write + @tparam l number of bytes to write + @tparam w write request type + @tparam V type of val + @return whether any log was written */ + template<unsigned l,write_type w= NORMAL,typename V> + inline bool write(const buf_block_t &block, void *ptr, V val) + MY_ATTRIBUTE((nonnull)); + + /** Log a write of a byte string to a page. + @param[in] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write */ + inline void memcpy(const buf_block_t &b, ulint ofs, ulint len); + + /** Write a byte string to a page. + @param[in,out] b buffer page + @param[in] dest destination within b.frame + @param[in] str the data to write + @param[in] len length of the data to write + @tparam w write request type */ + template<write_type w= NORMAL> + inline void memcpy(const buf_block_t &b, void *dest, const void *str, + ulint len); + + /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page. + @param[in] b ROW_FORMAT=COMPRESSED index page + @param[in] offset byte offset from b.zip.data + @param[in] len length of the data to write */ + inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len); + + /** Write a byte string to a ROW_FORMAT=COMPRESSED page. + @param[in] b ROW_FORMAT=COMPRESSED index page + @param[in] dest destination within b.zip.data + @param[in] str the data to write + @param[in] len length of the data to write + @tparam w write request type */ + template<write_type w= NORMAL> + inline void zmemcpy(const buf_block_t &b, void *dest, const void *str, + ulint len); + + /** Log an initialization of a string of bytes. + @param[in] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write + @param[in] val the data byte to write */ + inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val); + + /** Initialize a string of bytes. 
+ @param[in,out] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write + @param[in] val the data byte to write */ + inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val); + + /** Log an initialization of a repeating string of bytes. + @param[in] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write, in bytes + @param[in] str the string to write + @param[in] size size of str, in bytes */ + inline void memset(const buf_block_t &b, ulint ofs, size_t len, + const void *str, size_t size); + + /** Initialize a repeating string of bytes. + @param[in,out] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write, in bytes + @param[in] str the string to write + @param[in] size size of str, in bytes */ + inline void memset(const buf_block_t *b, ulint ofs, size_t len, + const void *str, size_t size); + + /** Log that a string of bytes was copied from the same page. + @param[in] b buffer page + @param[in] d destination offset within the page + @param[in] s source offset within the page + @param[in] len length of the data to copy */ + inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len); + + /** Initialize an entire page. + @param[in,out] b buffer page */ + void init(buf_block_t *b); + /** Free a page. + @param[in] space tablespace contains page to be freed + @param[in] offset page offset to be freed */ + inline void free(fil_space_t &space, uint32_t offset); + /** Write log for partly initializing a B-tree or R-tree page. + @param block B-tree or R-tree page + @param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */ + inline void page_create(const buf_block_t &block, bool comp); + + /** Write log for inserting a B-tree or R-tree record in + ROW_FORMAT=REDUNDANT. 
+ @param block B-tree or R-tree page + @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE + @param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_OLD_INFIMUM + @param info_bits info_bits of the record + @param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag() + @param hdr_c number of common record header bytes with prev_rec + @param data_c number of common data bytes with prev_rec + @param hdr record header bytes to copy to the log + @param hdr_l number of copied record header bytes + @param data record payload bytes to copy to the log + @param data_l number of copied record data bytes */ + inline void page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_bits, + ulint n_fields_s, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l); + /** Write log for inserting a B-tree or R-tree record in + ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC. + @param block B-tree or R-tree page + @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE + @param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_NEW_INFIMUM + @param info_status rec_get_info_and_status_bits() + @param shift unless !reuse: number of bytes the PAGE_FREE is moving + @param hdr_c number of common record header bytes with prev_rec + @param data_c number of common data bytes with prev_rec + @param hdr record header bytes to copy to the log + @param hdr_l number of copied record header bytes + @param data record payload bytes to copy to the log + @param data_l number of copied record data bytes */ + inline void page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_status, + ssize_t shift, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l); + /** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT. 
+ @param block B-tree or R-tree page + @param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_OLD_INFIMUM */ + inline void page_delete(const buf_block_t &block, ulint prev_rec); + /** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record. + @param block B-tree or R-tree page + @param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_NEW_INFIMUM + @param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES + @param data_size data payload size, in bytes */ + inline void page_delete(const buf_block_t &block, ulint prev_rec, + size_t hdr_size, size_t data_size); + + /** Write log for initializing an undo log page. + @param block undo page */ + inline void undo_create(const buf_block_t &block); + /** Write log for appending an undo log record. + @param block undo page + @param data record within the undo page + @param len length of the undo record, in bytes */ + inline void undo_append(const buf_block_t &block, + const void *data, size_t len); + /** Trim the end of a tablespace. + @param id first page identifier that will not be in the file */ + inline void trim_pages(const page_id_t id); + + /** Write a log record about a file operation. + @param type file operation + @param space_id tablespace identifier + @param path file path + @param new_path new file path for type=FILE_RENAME */ + inline void log_file_op(mfile_type_t type, ulint space_id, + const char *path, + const char *new_path= nullptr); + + /** Add freed page numbers to freed_pages */ + void add_freed_offset(fil_space_t *space, uint32_t page) + { + ut_ad(is_named_space(space)); + if (!m_freed_pages) + { + m_freed_pages= new range_set(); + ut_ad(!m_freed_space); + m_freed_space= space; + } + else + ut_ad(m_freed_space == space); + m_freed_pages->add_value(page); + } + + /** Determine the added buffer fix count of a block. 
+ @param block block to be checked + @return number of buffer count added by this mtr */ + uint32_t get_fix_count(const buf_block_t *block) const; + + /** type of page flushing is needed during commit() */ + enum page_flush_ahead + { + /** no need to trigger page cleaner */ + PAGE_FLUSH_NO= 0, + /** asynchronous flushing is needed */ + PAGE_FLUSH_ASYNC, + /** furious flushing is needed */ + PAGE_FLUSH_SYNC + }; + +private: + /** Log a write of a byte string to a page. + @param block buffer page + @param offset byte offset within page + @param data data to be written + @param len length of the data, in bytes */ + inline void memcpy_low(const buf_block_t &block, uint16_t offset, + const void *data, size_t len); + /** + Write a log record. + @tparam type redo log record type + @param id persistent page identifier + @param bpage buffer pool page, or nullptr + @param len number of additional bytes to write + @param alloc whether to allocate the additional bytes + @param offset byte offset, or 0 if the record type does not allow one + @return end of mini-transaction log, minus len */ + template<byte type> + inline byte *log_write(const page_id_t id, const buf_page_t *bpage, + size_t len= 0, bool alloc= false, size_t offset= 0); + + /** Write an EXTENDED log record. + @param block buffer pool page + @param type extended record subtype; @see mrec_ext_t */ + inline void log_write_extended(const buf_block_t &block, byte type); + + /** Prepare to write the mini-transaction log to the redo log buffer. + @return number of bytes to write in finish_write() */ + inline ulint prepare_write(); + + /** Append the redo log records to the redo log buffer. 
+ @param len number of bytes to write + @return {start_lsn,flush_ahead} */ + inline std::pair<lsn_t,page_flush_ahead> finish_write(ulint len); + + /** Release the resources */ + inline void release_resources(); + +#ifdef UNIV_DEBUG +public: + /** @return whether the mini-transaction is active */ + bool is_active() const + { ut_ad(!m_commit || m_start); return m_start && !m_commit; } + /** @return whether the mini-transaction has been committed */ + bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; } +private: + /** whether start() has been called */ + bool m_start= false; + /** whether commit() has been called */ + bool m_commit= false; +#endif + + /** The page of the most recent m_log record written, or NULL */ + const buf_page_t* m_last; + /** The current byte offset in m_last, or 0 */ + uint16_t m_last_offset; + + /** specifies which operations should be logged; default MTR_LOG_ALL */ + uint16_t m_log_mode:2; + + /** whether at least one buffer pool page was written to */ + uint16_t m_modifications:1; + + /** whether at least one previously clean buffer pool page was written to */ + uint16_t m_made_dirty:1; + + /** whether change buffer is latched; only needed in non-debug builds + to suppress some read-ahead operations, @see ibuf_inside() */ + uint16_t m_inside_ibuf:1; + + /** whether pages have been trimmed; set by set_trim_pages() */ + uint16_t m_trim_pages:1; + +#ifdef UNIV_DEBUG + /** Persistent user tablespace associated with the + mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */ + uint32_t m_user_space_id; +#endif /* UNIV_DEBUG */ + + /** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */ + mtr_buf_t m_memo; + + /** mini-transaction log */ + mtr_buf_t m_log; + + /** user tablespace that is being modified by the mini-transaction */ + fil_space_t* m_user_space; + + /** LSN at commit time */ + lsn_t m_commit_lsn; + + /** tablespace where pages have been freed */ + fil_space_t *m_freed_space= nullptr; + /** set of freed page ids */ + 
range_set *m_freed_pages= nullptr; +}; + +#include "mtr0mtr.ic" + +#endif /* mtr0mtr_h */ diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic new file mode 100644 index 00000000..4a483379 --- /dev/null +++ b/storage/innobase/include/mtr0mtr.ic @@ -0,0 +1,173 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0mtr.ic +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0buf.h" + +/** Check if a mini-transaction is dirtying a clean page. +@return true if the mtr is dirtying a clean page. */ +inline bool mtr_t::is_block_dirtied(const buf_block_t *block) +{ + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count()); + return block->page.oldest_modification() <= 1; +} + +/** +Pushes an object to an mtr memo stack. 
*/ +void +mtr_t::memo_push(void* object, mtr_memo_type_t type) +{ + ut_ad(is_active()); + ut_ad(object != NULL); + ut_ad(type >= MTR_MEMO_PAGE_S_FIX); + ut_ad(type <= MTR_MEMO_SPACE_X_LOCK); + ut_ad(ut_is_2pow(type)); + + /* If this mtr has x-fixed a clean page then we set + the made_dirty flag. This tells us if we need to + grab log_flush_order_mutex at mtr_commit so that we + can insert the dirtied page to the flush list. */ + + if ((type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX) + && !m_made_dirty) { + + m_made_dirty = is_block_dirtied( + reinterpret_cast<const buf_block_t*>(object)); + } + + mtr_memo_slot_t* slot = m_memo.push<mtr_memo_slot_t*>(sizeof(*slot)); + + slot->type = type; + slot->object = object; +} + +/** +Releases the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +void +mtr_t::release_s_latch_at_savepoint( + ulint savepoint, + rw_lock_t* lock) +{ + ut_ad(is_active()); + ut_ad(m_memo.size() > savepoint); + + mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint); + + ut_ad(slot->object == lock); + ut_ad(slot->type == MTR_MEMO_S_LOCK); + + rw_lock_s_unlock(lock); + + slot->object = NULL; +} + +/** +SX-latches the not yet latched block after a savepoint. */ + +void +mtr_t::sx_latch_at_savepoint( + ulint savepoint, + buf_block_t* block) +{ + ut_ad(is_active()); + ut_ad(m_memo.size() > savepoint); + + ut_ad(!memo_contains_flagged( + block, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_SX_FIX)); + + mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint); + + ut_ad(slot->object == block); + + /* == RW_NO_LATCH */ + ut_a(slot->type == MTR_MEMO_BUF_FIX); + + rw_lock_sx_lock(&block->lock); + + if (!m_made_dirty) { + m_made_dirty = is_block_dirtied(block); + } + + slot->type = MTR_MEMO_PAGE_SX_FIX; +} + +/** +X-latches the not yet latched block after a savepoint. 
*/ + +void +mtr_t::x_latch_at_savepoint( + ulint savepoint, + buf_block_t* block) +{ + ut_ad(is_active()); + ut_ad(m_memo.size() > savepoint); + + ut_ad(!memo_contains_flagged( + block, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_SX_FIX)); + + mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint); + + ut_ad(slot->object == block); + + /* == RW_NO_LATCH */ + ut_a(slot->type == MTR_MEMO_BUF_FIX); + + rw_lock_x_lock(&block->lock); + + if (!m_made_dirty) { + m_made_dirty = is_block_dirtied(block); + } + + slot->type = MTR_MEMO_PAGE_X_FIX; +} + +/** +Releases the block in an mtr memo after a savepoint. */ + +void +mtr_t::release_block_at_savepoint( + ulint savepoint, + buf_block_t* block) +{ + ut_ad(is_active()); + + mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint); + + ut_a(slot->object == block); + + buf_page_release_latch(block, slot->type); + + reinterpret_cast<buf_block_t*>(block)->unfix(); + + slot->object = NULL; +} diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h new file mode 100644 index 00000000..d1b6784a --- /dev/null +++ b/storage/innobase/include/mtr0types.h @@ -0,0 +1,347 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0types.h +Mini-transaction buffer global types + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0types_h +#define mtr0types_h + +#ifndef UNIV_INNOCHECKSUM +#include "sync0rw.h" +#else +#include "univ.i" +#endif /* UNIV_INNOCHECKSUM */ + +struct mtr_t; + +/** Logging modes for a mini-transaction */ +enum mtr_log_t { + /** Default mode: log all operations modifying disk-based data */ + MTR_LOG_ALL = 0, + + /** Log no operations and dirty pages are not added to the flush list. + Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */ + MTR_LOG_NONE, + + /** Don't generate REDO log but add dirty pages to flush list */ + MTR_LOG_NO_REDO +}; + +/* +A mini-transaction is a stream of records that is always terminated by +a NUL byte. The first byte of a mini-transaction record is never NUL, +but NUL bytes can occur within mini-transaction records. The first +bytes of each record will explicitly encode the length of the record. +NUL bytes also act as padding in log blocks, that is, there can be +multiple successive NUL bytes between mini-transactions in a redo log +block. + +The first byte of the record would contain a record type, flags, and a +part of length. The optional second byte of the record will contain +more length. (Not needed for short records.) + +Bit 7 of the first byte of a redo log record is the same_page flag. +If same_page=1, the record is referring to the same page as the +previous record.
Records that do not refer to data pages but to file +operations are identified by setting the same_page=1 in the very first +record(s) of the mini-transaction. A mini-transaction record that +carries same_page=0 must only be followed by page-oriented records. + +Bits 6..4 of the first byte of a redo log record identify the redo log +type. The following record types refer to data pages: + + FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE + INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2 + EXTENDED (2): extended record; followed by subtype code @see mrec_ext_t + WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_* + MEMSET (4): extends the 10.4 MLOG_MEMSET record + MEMMOVE (5): copy data within the page (avoids logging redundant data) + RESERVED (6): reserved for future use; a subtype code + (encoded immediately after the length) would be written + to reserve code space for further extensions + OPTION (7): optional record that may be ignored; a subtype code + (encoded immediately after the length) would distinguish actual + usage, such as: + * MDEV-18976 page checksum record + * binlog record + * SQL statement (at the start of statement) + +Bits 3..0 indicate the redo log record length, excluding the first +byte, but including additional length bytes and any other bytes, +such as the optional tablespace identifier and page number. +Values 1..15 represent lengths of 1 to 15 bytes. The special value 0 +indicates that 1 to 3 length bytes will follow to encode the remaining +length that exceeds 16 bytes. + +Additional length bytes if length>16: 0 to 3 bytes +0xxxxxxx for 0 to 127 (total: 16 to 143 bytes) +10xxxxxx xxxxxxxx for 128 to 16511 (total: 144 to 16527) +110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679) +111xxxxx reserved (corrupted record, and file!) 
+ +If same_page=0, the tablespace identifier and page number will use +similar 1-to-5-byte variable-length encoding: +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx for 2,113,664 to 270,549,119 +11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487 +11111xxx reserved (corrupted record) +Note: Some 5-byte values are reserved, because the tablespace identifier +and page number can only be up to 4,294,967,295. + +If same_page=1 is set in a record that follows a same_page=0 record +in a mini-transaction, the tablespace identifier and page number +fields will be omitted. + +(For some file-oriented records (if same_page=1 for the first records +of a mini-transaction), we will write tablespace identifier using the +same 1-to-5-byte encoding. TBD: describe the exact format of +file-oriented records. With MDEV-14425, we could write file-level log +records to a separate file, not interleaved with page-level redo log +at all. We could reserve the file ib_logfile0 for checkpoint information +and for file-level redo log records.) + +For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated +as corrupted (or reserved for future extension). The type code must +be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier +and page number). If the record length does not match the encoded +lengths of the tablespace identifier and page number, the record will +be treated as corrupted. This allows future expansion of the format. + +If there is a FREE_PAGE record in a mini-transaction, it must be the +only record for that page in the mini-transaction. If there is an +INIT_PAGE record for a page in a mini-transaction, it must be the +first record for that page in the mini-transaction. 
+ +An EXTENDED record must be followed by 1+1 to 5+5 bytes for the page +identifier (unless the same_page flag is set) and a subtype; @see mrec_ext_t + +For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset +on the page, relative from the previous offset. If same_page=0, the +"previous offset" is 0. If same_page=1, the "previous offset" is where +the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE). +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +111xxxxx reserved (corrupted record) +If the sum of the "previous offset" and the current offset exceeds the +page size, the record is treated as corrupted. Negative relative offsets +cannot be written. Instead, a record with same_page=0 can be written. + +For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to +3 bytes. If the length+offset exceeds the page size, the record will +be treated as corrupted. + +For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes, +relative to the current offset. The offset 0 is not possible, and +the sign bit is the least significant bit. That is, ++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and +-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...). +The source offset must be within the page size, or else the record +will be treated as corrupted. + +For MEMSET or WRITE, the byte(s) to be written will follow. For +MEMSET, it usually is a single byte, but it could also be a multi-byte +string, which would be copied over and over until the target length is +reached. The length of the remaining bytes is implied by the length +bytes at the start of the record. + +For MEMMOVE, if any bytes follow, the record is treated as corrupted +(future expansion). + +As mentioned at the start of this comment, the type byte 0 would be +special, marking the end of a mini-transaction. 
We could use the +corresponding value 0x80 (with same_page=1) for something special, +such as a future extension when more type codes are needed, or for +encoding rarely needed redo log records. + +Examples: + +INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2 +bytes to follow" and "tablespace ID 0x34", "page number 0x56". +The first byte must be between 0x12 and 0x1a, and the total length of +the record must match the lengths of the encoded tablespace ID and +page number. + +WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning +"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40", +"page number 0x57", "byte offset 0x60", data 0x12,0x34,0x56. + +A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23 +0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to +follow", "byte offset 0x7f"+0x60+3, bytes 0x23,0x34,0x56,0x78. + +The end of the mini-transaction would be indicated by a NUL byte. +*/ + +/** Redo log record types. These bit patterns (3 bits) will be written +to the redo log file, so the existing codes or their interpretation on +crash recovery must not be changed. */ +enum mrec_type_t +{ + /** Free a page. On recovery, it is unnecessary to read the page. + The next record for the page (if any) must be INIT_PAGE. + After this record has been written, the page may be + overwritten with zeros, or discarded or trimmed. */ + FREE_PAGE= 0, + /** Zero-initialize a page. The current byte offset (for subsequent + records) will be reset to FIL_PAGE_TYPE. */ + INIT_PAGE= 0x10, + /** Extended record; followed by a subtype code, @see mrec_ext_t */ + EXTENDED= 0x20, + /** Write a string of bytes. Followed by the byte offset (unsigned, + relative to the current byte offset, encoded in 1 to 3 bytes) and + the bytes to write (at least one). The current byte offset will be + set after the last byte written.
*/ + WRITE= 0x30, + /** Like WRITE, but before the bytes to write, the data_length-1 + (encoded in 1 to 3 bytes) will be encoded, and it must be more + than the length of the following data bytes to write. + The data byte(s) will be repeatedly copied to the output until + the data_length is reached. */ + MEMSET= 0x40, + /** Like MEMSET, but instead of the bytes to write, a source byte + offset (signed, nonzero, relative to the target byte offset, encoded + in 1 to 3 bytes, with the sign bit in the least significant bit) + will be written. + That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) + and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...). + The source offset and data_length must be within the page size, or + else the record will be treated as corrupted. The data will be + copied from the page as it was at the start of the + mini-transaction. */ + MEMMOVE= 0x50, + /** Reserved for future use. */ + RESERVED= 0x60, + /** Optional record that may be ignored in crash recovery. + A subtype code will be encoded immediately after the length. + Possible subtypes would include a MDEV-18976 page checksum record, + a binlog record, or an SQL statement. */ + OPTION= 0x70 +}; + + +/** Supported EXTENDED record subtypes. */ +enum mrec_ext_t +{ + /** Partly initialize a ROW_FORMAT=REDUNDANT B-tree or R-tree index page, + including writing the "infimum" and "supremum" pseudo-records. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + INIT_ROW_FORMAT_REDUNDANT= 0, + /** Partly initialize a ROW_FORMAT=COMPACT or DYNAMIC index page, + including writing the "infimum" and "supremum" pseudo-records. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + INIT_ROW_FORMAT_DYNAMIC= 1, + /** Initialize an undo log page. + This is roughly (not exactly) equivalent to the old MLOG_UNDO_INIT record. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + UNDO_INIT= 2, + /** Append a record to an undo log page. 
+ This is equivalent to the old MLOG_UNDO_INSERT record. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + UNDO_APPEND= 3, + /** Insert a ROW_FORMAT=REDUNDANT record, extending PAGE_HEAP_TOP. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + INSERT_HEAP_REDUNDANT= 4, + /** Insert a ROW_FORMAT=REDUNDANT record, reusing PAGE_FREE. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + INSERT_REUSE_REDUNDANT= 5, + /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, extending PAGE_HEAP_TOP. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + INSERT_HEAP_DYNAMIC= 6, + /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, reusing PAGE_FREE. + The current byte offset will be reset to FIL_PAGE_TYPE. */ + INSERT_REUSE_DYNAMIC= 7, + /** Delete a record on a ROW_FORMAT=REDUNDANT page. + We point to the predecessor of the record to be deleted. + The current byte offset will be reset to FIL_PAGE_TYPE. + This is similar to the old MLOG_REC_DELETE record. */ + DELETE_ROW_FORMAT_REDUNDANT= 8, + /** Delete a record on a ROW_FORMAT=COMPACT or DYNAMIC page. + We point to the predecessor of the record to be deleted + and include the total size of the record being deleted. + The current byte offset will be reset to FIL_PAGE_TYPE. + This is similar to the old MLOG_COMP_REC_DELETE record. */ + DELETE_ROW_FORMAT_DYNAMIC= 9, + /** Truncate a data file. */ + TRIM_PAGES= 10 +}; + + +/** Redo log record types for file-level operations. These bit +patterns will be written to redo log files, so the existing codes or +their interpretation on crash recovery must not be changed. */ +enum mfile_type_t +{ + /** Create a file. Followed by tablespace ID and the file name. */ + FILE_CREATE = 0x80, + /** Delete a file. Followed by tablespace ID and the file name. */ + FILE_DELETE = 0x90, + /** Rename a file. Followed by tablespace ID and the old file name, + NUL, and the new file name. */ + FILE_RENAME = 0xa0, + /** Modify a file.
Followed by tablespace ID and the file name. */ + FILE_MODIFY = 0xb0, +#if 1 /* MDEV-14425 FIXME: Remove this! */ + /** End-of-checkpoint marker. Followed by 2 dummy bytes of page identifier, + 8 bytes of LSN, and padded with a NUL; @see SIZE_OF_FILE_CHECKPOINT. */ + FILE_CHECKPOINT = 0xf0 +#endif +}; + +#if 1 /* MDEV-14425 FIXME: Remove this! */ +/** Size of a FILE_CHECKPOINT record, including the trailing byte to +terminate the mini-transaction. */ +constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1; +#endif + +#ifndef UNIV_INNOCHECKSUM +/** Types for the mlock objects to store in the mtr_t::m_memo */ +enum mtr_memo_type_t { + MTR_MEMO_PAGE_S_FIX = RW_S_LATCH, + + MTR_MEMO_PAGE_X_FIX = RW_X_LATCH, + + MTR_MEMO_PAGE_SX_FIX = RW_SX_LATCH, + + MTR_MEMO_BUF_FIX = RW_NO_LATCH, + + MTR_MEMO_MODIFY = 16, + + MTR_MEMO_PAGE_X_MODIFY = MTR_MEMO_PAGE_X_FIX | MTR_MEMO_MODIFY, + MTR_MEMO_PAGE_SX_MODIFY = MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_MODIFY, + + MTR_MEMO_S_LOCK = RW_S_LATCH << 5, + + MTR_MEMO_X_LOCK = RW_X_LATCH << 5, + + MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5, + + /** acquire X-latch on fil_space_t::latch */ + MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1 +}; +#endif /* !UNIV_INNOCHECKSUM */ + +#endif /* mtr0types_h */ diff --git a/storage/innobase/include/os0event.h b/storage/innobase/include/os0event.h new file mode 100644 index 00000000..52f6500a --- /dev/null +++ b/storage/innobase/include/os0event.h @@ -0,0 +1,131 @@ +/***************************************************************************** +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0event.h +The interface to the operating system condition variables + +Created 2012-09-23 Sunny Bains (split from os0sync.h) +*******************************************************/ + +#ifndef os0event_h +#define os0event_h + +#include "univ.i" + +// Forward declaration. +struct os_event; +typedef struct os_event* os_event_t; + +/** Denotes an infinite delay for os_event_wait_time() */ +#define OS_SYNC_INFINITE_TIME ULINT_UNDEFINED + +/** Return value of os_event_wait_time() when the time is exceeded */ +#define OS_SYNC_TIME_EXCEEDED 1 + +/** +Creates an event semaphore, i.e., a semaphore which may just have two states: +signaled and nonsignaled. The created event is manual reset: it must be reset +explicitly by calling os_event_reset(). +@return the event handle */ +os_event_t os_event_create(const char*); + +/** +Sets an event semaphore to the signaled state: lets waiting threads +proceed. */ +void +os_event_set( +/*=========*/ + os_event_t event); /*!< in/out: event to set */ + +/** +Check if the event is set. +@return true if set */ +bool +os_event_is_set( +/*============*/ + const os_event_t event); /*!< in: event to check */ + +/** +Resets an event semaphore to the nonsignaled state. Waiting threads will +stop to wait for the event. +The return value should be passed to os_event_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. See comments for os_event_wait_low().
*/ +int64_t +os_event_reset( +/*===========*/ + os_event_t event); /*!< in/out: event to reset */ + +/** +Frees an event object. */ +void +os_event_destroy( +/*=============*/ + os_event_t& event); /*!< in/own: event to free */ + +/** +Waits for an event object until it is in the signaled state. + +Typically, if the event has been signalled after the os_event_reset() +we'll return immediately because event->is_set == TRUE. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->is_set == TRUE] +thread C calls os_event_reset() [event->is_set == FALSE] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. */ +void +os_event_wait_low( +/*==============*/ + os_event_t event, /*!< in/out: event to wait */ + int64_t reset_sig_count);/*!< in: zero or the value + returned by previous call of + os_event_reset(). */ + +/** Blocking infinite wait on an event, until signalled. +@param e - event to wait on. */ +#define os_event_wait(e) os_event_wait_low((e), 0) + +/** +Waits for an event object until it is in the signaled state or +a timeout is exceeded. In Unix the timeout is always infinite. +@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ +ulint +os_event_wait_time_low( +/*===================*/ + os_event_t event, /*!< in/out: event to wait */ + ulint time_in_usec, /*!< in: timeout in + microseconds, or + OS_SYNC_INFINITE_TIME */ + int64_t reset_sig_count); /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ + +/** Blocking timed wait on an event. +@param e - event to wait on.
+@param t - timeout in microseconds */ +#define os_event_wait_time(e, t) os_event_wait_time_low((e), (t), 0) + +#endif /* !os0event_h */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h new file mode 100644 index 00000000..9b5e5058 --- /dev/null +++ b/storage/innobase/include/os0file.h @@ -0,0 +1,1228 @@ +/*********************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file include/os0file.h +The interface to the operating system file io + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0file_h +#define os0file_h + +#include "fsp0types.h" +#include "tpool.h" +#include "my_counter.h" + +#ifndef _WIN32 +#include <dirent.h> +#include <sys/stat.h> +#include <time.h> +#endif /* !_WIN32 */ + +extern bool os_has_said_disk_full; + +/** File offset in bytes */ +typedef ib_uint64_t os_offset_t; + +#ifdef _WIN32 + +/** We define always WIN_ASYNC_IO, and check at run-time whether +the OS actually supports it: Win 95 does not, NT does. */ +# define WIN_ASYNC_IO + +/** Use unbuffered I/O */ +# define UNIV_NON_BUFFERED_IO + +/** File handle */ +typedef native_file_handle os_file_t; + + +#else /* _WIN32 */ + +/** File handle */ +typedef int os_file_t; + +#endif /* _WIN32 */ + +static const os_file_t OS_FILE_CLOSED = IF_WIN(os_file_t(INVALID_HANDLE_VALUE),-1); + +/** File descriptor with optional PERFORMANCE_SCHEMA instrumentation */ +struct pfs_os_file_t +{ + /** Default constructor */ + pfs_os_file_t(os_file_t file = OS_FILE_CLOSED) : m_file(file) +#ifdef UNIV_PFS_IO + , m_psi(NULL) +#endif + {} + + /** The wrapped file handle */ + os_file_t m_file; +#ifdef UNIV_PFS_IO + /** PERFORMANCE_SCHEMA descriptor */ + struct PSI_file *m_psi; +#endif + /** Implicit type conversion. + @return the wrapped file handle */ + operator os_file_t() const { return m_file; } + /** Assignment operator. 
+ @param[in] file file handle to be assigned */ + void operator=(os_file_t file) { m_file = file; } + bool operator==(os_file_t file) const { return m_file == file; } + bool operator!=(os_file_t file) const { return !(*this == file); } +#ifndef DBUG_OFF + friend std::ostream& operator<<(std::ostream& os, pfs_os_file_t f){ + os << os_file_t(f); + return os; + } +#endif +}; + +/** The next value should be smaller or equal to the smallest sector size used +on any disk. A log block is required to be a portion of disk which is written +so that if the start and the end of a block get written to disk, then the +whole block gets written. This should be true even in most cases of a crash: +if this fails for a log block, then it is equivalent to a media failure in the +log. */ + +#define OS_FILE_LOG_BLOCK_SIZE 512U + +/** Options for os_file_create_func @{ */ +enum os_file_create_t { + OS_FILE_OPEN = 51, /*!< to open an existing file (if + doesn't exist, error) */ + OS_FILE_CREATE, /*!< to create new file (if + exists, error) */ + OS_FILE_OVERWRITE, /*!< to create a new file, if exists + then overwrite old file */ + OS_FILE_OPEN_RAW, /*!< to open a raw device or disk + partition */ + OS_FILE_CREATE_PATH, /*!< to create the directories */ + OS_FILE_OPEN_RETRY, /*!< open with retry */ + + /** Flags that can be combined with the above values. Please ensure + that the above values stay below 128.
*/ + + OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */ + OS_FILE_ON_ERROR_SILENT = 256 /*!< don't print diagnostic messages to + the log unless it is a fatal error, + this flag is only used if + ON_ERROR_NO_EXIT is set */ +}; + +static const ulint OS_FILE_READ_ONLY = 333; +static const ulint OS_FILE_READ_WRITE = 444; + +/** Used by MySQLBackup */ +static const ulint OS_FILE_READ_ALLOW_DELETE = 555; + +/* Options for file_create */ +static const ulint OS_FILE_AIO = 61; +static const ulint OS_FILE_NORMAL = 62; +/* @} */ + +/** Types for file create @{ */ +static const ulint OS_DATA_FILE = 100; +static const ulint OS_LOG_FILE = 101; +static const ulint OS_DATA_FILE_NO_O_DIRECT = 103; +/* @} */ + +/** Error codes from os_file_get_last_error @{ */ +static const ulint OS_FILE_NAME_TOO_LONG = 36; +static const ulint OS_FILE_NOT_FOUND = 71; +static const ulint OS_FILE_DISK_FULL = 72; +static const ulint OS_FILE_ALREADY_EXISTS = 73; +static const ulint OS_FILE_PATH_ERROR = 74; + +/** wait for OS aio resources to become available again */ +static const ulint OS_FILE_AIO_RESOURCES_RESERVED = 75; + +static const ulint OS_FILE_SHARING_VIOLATION = 76; +static const ulint OS_FILE_ERROR_NOT_SPECIFIED = 77; +static const ulint OS_FILE_INSUFFICIENT_RESOURCE = 78; +static const ulint OS_FILE_AIO_INTERRUPTED = 79; +static const ulint OS_FILE_OPERATION_ABORTED = 80; +static const ulint OS_FILE_ACCESS_VIOLATION = 81; +static const ulint OS_FILE_OPERATION_NOT_SUPPORTED = 125; +static const ulint OS_FILE_ERROR_MAX = 200; +/* @} */ + +/** +The I/O context that is passed down to the low level IO code */ +class IORequest +{ +public: + enum Type + { + /** Synchronous read */ + READ_SYNC= 2, + /** Asynchronous read; some errors will be ignored */ + READ_ASYNC= READ_SYNC | 1, + /** Possibly partial read; only used with + os_file_read_no_error_handling() */ + READ_MAYBE_PARTIAL= READ_SYNC | 4, + /** Read for doublewrite buffer recovery */ + DBLWR_RECOVER= READ_SYNC | 8, + 
/** Synchronous write */ + WRITE_SYNC= 16, + /** Asynchronous write */ + WRITE_ASYNC= WRITE_SYNC | 1, + /** A doublewrite batch */ + DBLWR_BATCH= WRITE_ASYNC | 8, + /** Write data; evict the block on write completion */ + WRITE_LRU= WRITE_ASYNC | 32, + /** Write data and punch hole for the rest */ + PUNCH= WRITE_ASYNC | 64, + /** Write data and punch hole; evict the block on write completion */ + PUNCH_LRU= PUNCH | WRITE_LRU, + /** Zero out a range of bytes in fil_space_t::io() */ + PUNCH_RANGE= WRITE_SYNC | 128, + }; + + constexpr IORequest(buf_page_t *bpage, fil_node_t *node, Type type) : + bpage(bpage), node(node), type(type) {} + + constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) : + bpage(bpage), type(type) {} + + bool is_read() const { return (type & READ_SYNC) != 0; } + bool is_write() const { return (type & WRITE_SYNC) != 0; } + bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; } + bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; } + + /** If requested, free storage space associated with a section of the file. + @param off byte offset from the start (SEEK_SET) + @param len size of the hole in bytes + @return DB_SUCCESS or error code */ + dberr_t maybe_punch_hole(os_offset_t off, ulint len) + { + return off && len && node && (type & (PUNCH ^ WRITE_ASYNC)) + ? punch_hole(off, len) + : DB_SUCCESS; + } + +private: + /** Free storage space associated with a section of the file. 
+ @param off byte offset from the start (SEEK_SET) + @param len size of the hole in bytes + @return DB_SUCCESS or error code */ + dberr_t punch_hole(os_offset_t off, ulint len) const; + +public: + /** Page to be written on write operation */ + buf_page_t* const bpage= nullptr; + + /** File descriptor */ + fil_node_t *const node= nullptr; + + /** Request type bit flags */ + const Type type; +}; + +constexpr IORequest IORequestRead(IORequest::READ_SYNC); +constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL); +constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC); + +/** Sparse file size information. */ +struct os_file_size_t { + /** Total size of file in bytes */ + os_offset_t m_total_size; + + /** If it is a sparse file then this is the number of bytes + actually allocated for the file. */ + os_offset_t m_alloc_size; +}; + +constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256; + +extern Atomic_counter<ulint> os_n_file_reads; +extern ulint os_n_file_writes; +extern ulint os_n_fsyncs; + +/* File types for directory entry data type */ + +enum os_file_type_t { + OS_FILE_TYPE_UNKNOWN = 0, + OS_FILE_TYPE_FILE, /* regular file */ + OS_FILE_TYPE_DIR, /* directory */ + OS_FILE_TYPE_LINK, /* symbolic link */ + OS_FILE_TYPE_BLOCK /* block device */ +}; + +/* Maximum path string length in bytes when referring to tables with in the +'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers +of this size from the thread stack; that is why this should not be made much +bigger than 4000 bytes. The maximum path length used by any storage engine +in the server must be at least this big. 
*/ + +/* MySQL 5.7 my_global.h */ +#ifndef FN_REFLEN_SE +#define FN_REFLEN_SE 4000 +#endif + +#define OS_FILE_MAX_PATH 4000 +#if (FN_REFLEN_SE < OS_FILE_MAX_PATH) +# error "(FN_REFLEN_SE < OS_FILE_MAX_PATH)" +#endif + +/** Struct used in fetching information of a file in a directory */ +struct os_file_stat_t { + char name[OS_FILE_MAX_PATH]; /*!< path to a file */ + os_file_type_t type; /*!< file type */ + os_offset_t size; /*!< file size in bytes */ + os_offset_t alloc_size; /*!< Allocated size for + sparse files in bytes */ + size_t block_size; /*!< Block size to use for IO + in bytes*/ + time_t ctime; /*!< creation time */ + time_t mtime; /*!< modification time */ + time_t atime; /*!< access time */ + bool rw_perm; /*!< true if can be opened + in read-write mode. Only valid + if type == OS_FILE_TYPE_FILE */ +}; + +/** Create a temporary file. This function is like tmpfile(3), but +the temporary file is created in the directory given by the mysql server +configuration parameter (--tmpdir). +@return temporary file handle, or NULL on error */ +FILE* +os_file_create_tmpfile(); + +/** +This function attempts to create a directory named pathname. The new directory +gets default permissions. On Unix, the permissions are (0770 & ~umask). If the +directory exists already, nothing is done and the call succeeds, unless the +fail_if_exists argument is true. + +@param[in] pathname directory name as null-terminated string +@param[in] fail_if_exists if true, pre-existing directory is treated + as an error. +@return true if call succeeds, false on error */ +bool +os_file_create_directory( + const char* pathname, + bool fail_if_exists); + +/** NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. 
+@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded, false if error +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success); + +/** NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option + is used by a backup program reading the file +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_no_error_handling_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) + MY_ATTRIBUTE((warn_unused_result)); + +#ifdef _WIN32 +#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0) +#else +/** Tries to disable OS caching on an opened file descriptor. +@param[in] fd file descriptor to alter +@param[in] file_name file name, used in the diagnostic message +@param[in] operation_name "open" or "create"; used in the diagnostic + message */ +void +os_file_set_nocache( +/*================*/ + int fd, /*!< in: file descriptor to alter */ + const char* file_name, + const char* operation_name); +#endif + +/** NOTE! Use the corresponding macro os_file_create(), not directly +this function! 
+Opens an existing file or creates a new one. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async I/O or unbuffered I/O: look in the + function source code for the exact rules +@param[in] type OS_DATA_FILE or OS_LOG_FILE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_func( + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success) + MY_ATTRIBUTE((warn_unused_result)); + +/** Deletes a file. The file has to be closed before calling this. +@param[in] name file path as a null-terminated string +@return true if success */ +bool +os_file_delete_func(const char* name); + +/** Deletes a file if it exists. The file has to be closed before calling this. +@param[in] name file path as a null-terminated string +@param[out] exist indicates whether the file pre-existed +@return true if success */ +bool +os_file_delete_if_exists_func(const char* name, bool* exist); + +/** NOTE! Use the corresponding macro os_file_rename(), not directly +this function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@return true if success */ +bool +os_file_rename_func(const char* oldpath, const char* newpath); + +/** NOTE! Use the corresponding macro os_file_close(), not directly this +function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. 
+@param[in] file own: handle to a file +@return true if success */ +bool os_file_close_func(os_file_t file); + +#ifdef UNIV_PFS_IO + +/* Keys to register InnoDB I/O with performance schema */ +extern mysql_pfs_key_t innodb_data_file_key; +extern mysql_pfs_key_t innodb_log_file_key; +extern mysql_pfs_key_t innodb_temp_file_key; + +/* The following four pairs of macros are instrumentation to register +various file I/O operations with performance schema. +1) register_pfs_file_open_begin() and register_pfs_file_open_end() are +used to register file creation, opening, closing and renaming. +2) register_pfs_file_rename_begin() and register_pfs_file_rename_end() +are used to register file renaming +3) register_pfs_file_io_begin() and register_pfs_file_io_end() are +used to register actual file read, write and flush +4) register_pfs_file_close_begin() and register_pfs_file_close_end() +are used to register file deletion operations*/ +# define register_pfs_file_open_begin(state, locker, key, op, name, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_name_locker)( \ + state, key, op, name, &locker); \ + if (locker != NULL) { \ + PSI_FILE_CALL(start_file_open_wait)( \ + locker, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_open_end(locker, file, result) \ +do { \ + if (locker != NULL) { \ + file.m_psi = PSI_FILE_CALL(end_file_open_wait)( \ + locker, result); \ + } \ +} while (0) + +# define register_pfs_file_rename_begin(state, locker, key, op, name, \ + src_file, src_line) \ + register_pfs_file_open_begin(state, locker, key, op, name, \ + src_file, src_line) \ + +# define register_pfs_file_rename_end(locker, from, to, result) \ +do { \ + if (locker != NULL) { \ + PSI_FILE_CALL( \ + end_file_rename_wait)( \ + locker, from, to, result); \ + } \ +} while (0) + +# define register_pfs_file_close_begin(state, locker, key, op, name, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_name_locker)( \ + state, key, op, 
name, &locker); \ + if (locker != NULL) { \ + PSI_FILE_CALL(start_file_close_wait)( \ + locker, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_close_end(locker, result) \ +do { \ + if (locker != NULL) { \ + PSI_FILE_CALL(end_file_close_wait)( \ + locker, result); \ + } \ +} while (0) + +# define register_pfs_file_io_begin(state, locker, file, count, op, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_stream_locker)( \ + state, file.m_psi, op); \ + if (locker != NULL) { \ + PSI_FILE_CALL(start_file_wait)( \ + locker, count, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_io_end(locker, count) \ +do { \ + if (locker != NULL) { \ + PSI_FILE_CALL(end_file_wait)(locker, count); \ + } \ +} while (0) + +/* The following macros/functions are file I/O APIs that would be performance +schema instrumented if "UNIV_PFS_IO" is defined. They would point to +wrapper functions with performance schema instrumentation in such case. + +os_file_create +os_file_create_simple +os_file_create_simple_no_error_handling +os_file_close +os_file_rename +os_aio +os_file_read +os_file_read_no_error_handling +os_file_write + +The wrapper functions have the prefix of "pfs_". 
*/ + +# define os_file_create(key, name, create, purpose, type, read_only, \ + success) \ + pfs_os_file_create_func(key, name, create, purpose, type, \ + read_only, success, __FILE__, __LINE__) + +# define os_file_create_simple(key, name, create, access, \ + read_only, success) \ + pfs_os_file_create_simple_func(key, name, create, access, \ + read_only, success, __FILE__, __LINE__) + +# define os_file_create_simple_no_error_handling( \ + key, name, create_mode, access, read_only, success) \ + pfs_os_file_create_simple_no_error_handling_func( \ + key, name, create_mode, access, \ + read_only, success, __FILE__, __LINE__) + +# define os_file_close(file) \ + pfs_os_file_close_func(file, __FILE__, __LINE__) + +# define os_file_read(type, file, buf, offset, n) \ + pfs_os_file_read_func(type, file, buf, offset, n, __FILE__, __LINE__) + +# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \ + pfs_os_file_read_no_error_handling_func( \ + type, file, buf, offset, n, o, __FILE__, __LINE__) + +# define os_file_write(type, name, file, buf, offset, n) \ + pfs_os_file_write_func(type, name, file, buf, offset, \ + n, __FILE__, __LINE__) + +# define os_file_flush(file) \ + pfs_os_file_flush_func(file, __FILE__, __LINE__) + +# define os_file_rename(key, oldpath, newpath) \ + pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__) + +# define os_file_delete(key, name) \ + pfs_os_file_delete_func(key, name, __FILE__, __LINE__) + +# define os_file_delete_if_exists(key, name, exist) \ + pfs_os_file_delete_if_exists_func(key, name, exist, __FILE__, __LINE__) + +/** NOTE! Please use the corresponding macro os_file_create_simple(), +not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple() which opens or creates a file. 
+@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) + MY_ATTRIBUTE((warn_unused_result)); + +/** NOTE! Please use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple_no_error_handling(). Add instrumentation to +monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_no_error_handling_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) + MY_ATTRIBUTE((warn_unused_result)); + +/** NOTE! 
Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async I/O or unbuffered I/O: look in the + function source code for the exact rules +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) + MY_ATTRIBUTE((warn_unused_result)); + +/** NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). +@param[in] file handle to a file +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_close_func( + pfs_os_file_t file, + const char* src_file, + uint src_line); + +/** NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. 
+@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return DB_SUCCESS if request was successful */ +UNIV_INLINE +dberr_t +pfs_os_file_read_func( + const IORequest& type, + pfs_os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + const char* src_file, + uint src_line); + +/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +This is the performance schema instrumented wrapper function for +os_file_read_no_error_handling_func() which requests a synchronous +read operation. +@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[out] o number of bytes actually read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return DB_SUCCESS if request was successful */ +UNIV_INLINE +dberr_t +pfs_os_file_read_no_error_handling_func( + const IORequest& type, + pfs_os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o, + const char* src_file, + uint src_line); + +/** NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. 
+@param[in] type IO request context +@param[in] name Name of the file or path as NUL terminated + string +@param[in] file Open file handle +@param[in] buf buffer from which to write +@param[in] offset file offset where to write +@param[in] n number of bytes to write +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return DB_SUCCESS if request was successful */ +UNIV_INLINE +dberr_t +pfs_os_file_write_func( + const IORequest& type, + const char* name, + pfs_os_file_t file, + const void* buf, + os_offset_t offset, + ulint n, + const char* src_file, + uint src_line); + +/** NOTE! Please use the corresponding macro os_file_flush(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_flush() which flushes the write buffers of a given file to the disk. +Flushes the write buffers of a given file to the disk. +@param[in] file Open file handle +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_flush_func( + pfs_os_file_t file, + const char* src_file, + uint src_line); + + +/** NOTE! Please use the corresponding macro os_file_rename(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_rename() +@param[in] key Performance Schema Key +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_rename_func( + mysql_pfs_key_t key, + const char* oldpath, + const char* newpath, + const char* src_file, + uint src_line); + +/** +NOTE! Please use the corresponding macro os_file_delete(), not directly +this function! 
+This is the performance schema instrumented wrapper function for +os_file_delete() +@param[in] key Performance Schema Key +@param[in] name file path as a null-terminated string +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_func( + mysql_pfs_key_t key, + const char* name, + const char* src_file, + uint src_line); + +/** +NOTE! Please use the corresponding macro os_file_delete_if_exists(), not +directly this function! +This is the performance schema instrumented wrapper function for +os_file_delete_if_exists() +@param[in] key Performance Schema Key +@param[in] name file path as a null-terminated string +@param[out] exist indicates whether the file pre-existed +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_if_exists_func( + mysql_pfs_key_t key, + const char* name, + bool* exist, + const char* src_file, + uint src_line); + +#else /* UNIV_PFS_IO */ + +/* If UNIV_PFS_IO is not defined, these I/O APIs point +to original un-instrumented file I/O APIs */ +# define os_file_create(key, name, create, purpose, type, read_only, \ + success) \ + os_file_create_func(name, create, purpose, type, read_only, \ + success) + +# define os_file_create_simple(key, name, create_mode, access, \ + read_only, success) \ + os_file_create_simple_func(name, create_mode, access, \ + read_only, success) + +# define os_file_create_simple_no_error_handling( \ + key, name, create_mode, access, read_only, success) \ + os_file_create_simple_no_error_handling_func( \ + name, create_mode, access, read_only, success) + +# define os_file_close(file) os_file_close_func(file) + +# define os_file_read(type, file, buf, offset, n) \ + os_file_read_func(type, file, buf, offset, n) + +# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \ + 
os_file_read_no_error_handling_func(type, file, buf, offset, n, o) + +# define os_file_write(type, name, file, buf, offset, n) \ + os_file_write_func(type, name, file, buf, offset, n) + +# define os_file_flush(file) os_file_flush_func(file) + +# define os_file_rename(key, oldpath, newpath) \ + os_file_rename_func(oldpath, newpath) + +# define os_file_delete(key, name) os_file_delete_func(name) + +# define os_file_delete_if_exists(key, name, exist) \ + os_file_delete_if_exists_func(name, exist) + +#endif /* UNIV_PFS_IO */ + +/** Gets a file size. +@param[in] filename name of the file +@return file size if OK, else set m_total_size to ~0 and m_alloc_size + to errno */ +os_file_size_t +os_file_get_size( + const char* filename) + MY_ATTRIBUTE((warn_unused_result)); + +/** Gets a file size. +@param[in] file handle to a file +@return file size, or (os_offset_t) -1 on failure */ +os_offset_t +os_file_get_size( + os_file_t file) + MY_ATTRIBUTE((warn_unused_result)); + +/** Extend a file. + +On Windows, extending a file allocates blocks for the file, +unless the file is sparse. + +On Unix, we will extend the file with ftruncate(), if +file needs to be sparse. Otherwise posix_fallocate() is used +when available, and if not, binary zeroes are added to the end +of file. + +@param[in] name file name +@param[in] file file handle +@param[in] size desired file size +@param[in] is_sparse whether to create a sparse file (no preallocating) +@return whether the operation succeeded */ +bool +os_file_set_size( + const char* name, + os_file_t file, + os_offset_t size, + bool is_sparse = false) + MY_ATTRIBUTE((warn_unused_result)); + +/** Truncates a file at its current position. +@param[in,out] file file to be truncated +@return true if success */ +bool +os_file_set_eof( + FILE* file); /*!< in: file to be truncated */ + +/** Truncate a file to a specified size in bytes. 
+@param[in] pathname file path +@param[in] file file to be truncated +@param[in] size size preserved in bytes +@param[in] allow_shrink whether to allow the file to become smaller +@return true if success */ +bool +os_file_truncate( + const char* pathname, + os_file_t file, + os_offset_t size, + bool allow_shrink = false); + +/** NOTE! Use the corresponding macro os_file_flush(), not directly this +function! +Flushes the write buffers of a given file to the disk. +@param[in] file handle to a file +@return true if success */ +bool +os_file_flush_func( + os_file_t file); + +/** Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@param[in] report true if we want an error message printed + for all errors +@return error number, or OS error number + 100 */ +ulint +os_file_get_last_error( + bool report); + +/** NOTE! Use the corresponding macro os_file_read(), not directly this +function! +Requests a synchronous read operation. +@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@return DB_SUCCESS if request was successful */ +dberr_t +os_file_read_func( + const IORequest& type, + os_file_t file, + void* buf, + os_offset_t offset, + ulint n) + MY_ATTRIBUTE((warn_unused_result)); + +/** Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. +@param[in,out] file file to read from +@param[in,out] str buffer where to read +@param[in] size size of buffer */ +void +os_file_read_string( + FILE* file, + char* str, + ulint size); + +/** NOTE! 
Use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns an error code. +@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[out] o number of bytes actually read +@return DB_SUCCESS or error code */ +dberr_t +os_file_read_no_error_handling_func( + const IORequest& type, + os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o) + MY_ATTRIBUTE((warn_unused_result)); + +/** NOTE! Use the corresponding macro os_file_write(), not directly this +function! +Requests a synchronous write operation. +@param[in] type IO request context +@param[in] file Open file handle +@param[in] buf buffer from which to write +@param[in] offset file offset where to write +@param[in] n number of bytes to write +@return DB_SUCCESS if request was successful */ +dberr_t +os_file_write_func( + const IORequest& type, + const char* name, + os_file_t file, + const void* buf, + os_offset_t offset, + ulint n) + MY_ATTRIBUTE((warn_unused_result)); + +/** Check the existence and type of the given file. +@param[in] path pathname of the file +@param[out] exists true if file exists +@param[out] type type of the file (if it exists) +@return true if call succeeded */ +bool +os_file_status( + const char* path, + bool* exists, + os_file_type_t* type); + +/** This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. 
It is the callers +responsibility to free the return value after it is no longer needed. + +@param[in] old_path pathname +@param[in] new_name new file name +@return own: new full pathname */ +char* +os_file_make_new_pathname( + const char* old_path, + const char* new_name); + +/** This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. +@param[in,out] data_dir_path Full path/data_dir_path */ +void +os_file_make_data_dir_path( + char* data_dir_path); + +/** Create all missing subdirectories along the given path. +@return DB_SUCCESS if OK, otherwise error code. */ +dberr_t +os_file_create_subdirs_if_needed( + const char* path); + +#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR +/* Test the function os_file_get_parent_dir. */ +void +unit_test_os_file_get_parent_dir(); +#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */ + +/** +Initializes the asynchronous io system. */ +int os_aio_init(); + +/** +Frees the asynchronous io system. */ +void os_aio_free(); + +/** Request a read or write. +@param type I/O request +@param buf buffer +@param offset file offset +@param n number of bytes +@retval DB_SUCCESS if request was queued successfully +@retval DB_IO_ERROR on I/O error */ +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n); + +/** Wait until there are no pending asynchronous writes. +Only used on FLUSH TABLES...FOR EXPORT. */ +void os_aio_wait_until_no_pending_writes(); + + +/** Prints info of the aio arrays. 
+@param[in/out] file file where to print */ +void +os_aio_print(FILE* file); + +/** Refreshes the statistics used to print per-second averages. */ +void +os_aio_refresh_stats(); + +/** Checks that all slots in the system have been freed, that is, there are +no pending io operations. */ +bool +os_aio_all_slots_free(); + + +/** This function returns information about the specified file +@param[in] path pathname of the file +@param[in] stat_info information of a file in a directory +@param[in] check_rw_perm for testing whether the file can be opened + in RW mode +@param[in] read_only if true read only mode checks are enforced +@return DB_SUCCESS if all OK */ +dberr_t +os_file_get_status( + const char* path, + os_file_stat_t* stat_info, + bool check_rw_perm, + bool read_only); + +/** Set the file create umask +@param[in] umask The umask to use for file creation. */ +void +os_file_set_umask(ulint umask); + +#ifdef _WIN32 + +/** +Make file sparse, on Windows. + +@param[in] file file handle +@param[in] is_sparse if true, make file sparse, + otherwise "unsparse" the file +@return true on success, false on error */ +bool os_file_set_sparse_win32(os_file_t file, bool is_sparse = true); + +/** +Changes file size on Windows + +If file is extended, following happens the bytes between +old and new EOF are zeros. + +If file is sparse, "virtual" block is added at the end of +allocated area. + +If file is normal, file system allocates storage. + +@param[in] pathname file path +@param[in] file file handle +@param[in] size size to preserve in bytes +@return true if success */ +bool +os_file_change_size_win32( + const char* pathname, + os_file_t file, + os_offset_t size); + +#endif /*_WIN32 */ + +/** Free storage space associated with a section of the file. 
+@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return DB_SUCCESS or error code */ +dberr_t +os_file_punch_hole( + os_file_t fh, + os_offset_t off, + os_offset_t len) + MY_ATTRIBUTE((warn_unused_result)); + +/** Normalizes a directory path for the current OS: +On Windows, we convert '/' to '\', else we convert '\' to '/'. +@param[in,out] str A null-terminated directory and file path */ +void os_normalize_path(char* str); + +/* Determine if a path is an absolute path or not. +@param[in] OS directory or file path to evaluate +@retval true if an absolute path +@retval false if a relative path */ +UNIV_INLINE +bool +is_absolute_path( + const char* path) +{ + if (path[0] == OS_PATH_SEPARATOR) { + return(true); + } + +#ifdef _WIN32 + if (path[1] == ':' && path[2] == OS_PATH_SEPARATOR) { + return(true); + } +#endif /* _WIN32 */ + + return(false); +} + +#include "os0file.ic" + +#endif /* os0file_h */ diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic new file mode 100644 index 00000000..e88f94b8 --- /dev/null +++ b/storage/innobase/include/os0file.ic @@ -0,0 +1,450 @@ +/***************************************************************************** + +Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0file.ic +The interface to the operating system file io + +Created 2/20/2010 Jimmy Yang +*******************************************************/ + +#ifdef UNIV_PFS_IO +/** NOTE! Please use the corresponding macro os_file_create_simple(), +not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple() which opens or creates a file. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + (create_mode == OS_FILE_CREATE) + ? 
PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_simple_func( + name, create_mode, access_type, read_only, success); + + /* Register psi value for the file */ + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} + +/** NOTE! Please use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple_no_error_handling(). Add instrumentation to +monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_no_error_handling_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + create_mode == OS_FILE_CREATE + ? PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_simple_no_error_handling_func( + name, create_mode, access_type, read_only, success); + + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} + +/** NOTE! 
Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really us + async I/O or unbuffered I/O: look in the + function source code for the exact rules +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + create_mode == OS_FILE_CREATE + ? PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_func( + name, create_mode, purpose, type, read_only, success); + + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} +/** +NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). 
+@param[in] file handle to a file +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_close_func( + pfs_os_file_t file, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register the file close */ + register_pfs_file_io_begin( + &state, locker, file, 0, PSI_FILE_CLOSE, src_file, src_line); + + bool result = os_file_close_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. +@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return DB_SUCCESS if request was successful */ +UNIV_INLINE +dberr_t +pfs_os_file_read_func( + const IORequest& type, + pfs_os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, n, PSI_FILE_READ, src_file, src_line); + + dberr_t result; + + result = os_file_read_func(type, file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +This is the performance schema instrumented wrapper function for +os_file_read_no_error_handling_func() which requests a synchronous +read operation. 
+@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[out] o number of bytes actually read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return DB_SUCCESS if request was successful */ +UNIV_INLINE +dberr_t +pfs_os_file_read_no_error_handling_func( + const IORequest& type, + pfs_os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, n, PSI_FILE_READ, src_file, src_line); + + dberr_t result = os_file_read_no_error_handling_func( + type, file, buf, offset, n, o); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. 
+@param[in] type IO request context +@param[in] name Name of the file or path as NUL terminated + string +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return error code +@retval DB_SUCCESS if the request was successfully fulfilled */ +UNIV_INLINE +dberr_t +pfs_os_file_write_func( + const IORequest& type, + const char* name, + pfs_os_file_t file, + const void* buf, + os_offset_t offset, + ulint n, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, n, PSI_FILE_WRITE, src_file, src_line); + + dberr_t result; + + result = os_file_write_func(type, name, file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + + +/** NOTE! Please use the corresponding macro os_file_flush(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_flush() which flushes the write buffers of a given file to the disk. +Flushes the write buffers of a given file to the disk. +@param[in] file Open file handle +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_flush_func( + pfs_os_file_t file, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, 0, PSI_FILE_SYNC, src_file, src_line); + + bool result = os_file_flush_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_rename(), not directly +this function! 
+This is the performance schema instrumented wrapper function for +os_file_rename() +@param[in] key Performance Schema Key +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_rename_func( + mysql_pfs_key_t key, + const char* oldpath, + const char* newpath, + const char* src_file, + uint src_line) + +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_rename_begin( + &state, locker, key, PSI_FILE_RENAME, newpath, + src_file, src_line); + + bool result = os_file_rename_func(oldpath, newpath); + + register_pfs_file_rename_end(locker, oldpath, newpath, !result); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_delete(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_delete() +@param[in] key Performance Schema Key +@param[in] name old file path as a null-terminated string +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_func( + mysql_pfs_key_t key, + const char* name, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_close_begin( + &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line); + + bool result = os_file_delete_func(name); + + register_pfs_file_close_end(locker, 0); + + return(result); +} + +/** +NOTE! Please use the corresponding macro os_file_delete_if_exists(), not +directly this function! 
+This is the performance schema instrumented wrapper function for +os_file_delete_if_exists() +@param[in] key Performance Schema Key +@param[in] name old file path as a null-terminated string +@param[in] exist indicate if file pre-exist +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_if_exists_func( + mysql_pfs_key_t key, + const char* name, + bool* exist, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_close_begin( + &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line); + + bool result = os_file_delete_if_exists_func(name, exist); + + register_pfs_file_close_end(locker, 0); + + return(result); +} +#endif /* UNIV_PFS_IO */ diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h new file mode 100644 index 00000000..ed989045 --- /dev/null +++ b/storage/innobase/include/os0thread.h @@ -0,0 +1,98 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.h
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0thread_h
+#define os0thread_h
+
+#include "univ.i"
+
+/* Possible fixed priorities for threads */
+#define OS_THREAD_PRIORITY_NONE		100
+#define OS_THREAD_PRIORITY_BACKGROUND	1
+#define OS_THREAD_PRIORITY_NORMAL	2
+#define OS_THREAD_PRIORITY_ABOVE_NORMAL	3
+
+#ifdef _WIN32
+/* On Windows both the thread handle type and the thread id type are DWORD. */
+typedef DWORD			os_thread_t;
+typedef DWORD			os_thread_id_t;	/*!< In Windows the thread id
+						is an unsigned long int */
+/* Thread start routine type, declared with C linkage. */
+extern "C" {
+typedef LPTHREAD_START_ROUTINE	os_thread_func_t;
+}
+
+/** Macro for specifying a Windows thread start function. */
+#define DECLARE_THREAD(func)	WINAPI func
+#else
+
+/* Elsewhere both the thread handle type and the thread id type are
+pthread_t. */
+typedef pthread_t		os_thread_t;
+typedef pthread_t		os_thread_id_t;	/*!< In Unix we use the thread
+						handle itself as the id of
+						the thread */
+/* Thread start routine type, declared with C linkage. */
+extern "C"  { typedef void*	(*os_thread_func_t)(void*); }
+
+/** Macro for specifying a POSIX thread start function.
*/
+#define DECLARE_THREAD(func)	func
+#endif /* _WIN32 */
+
+/* Define a function pointer type to use in a typecast */
+typedef void* (*os_posix_f_t) (void*);
+
+#ifdef HAVE_PSI_INTERFACE
+/* Define for performance schema registration key */
+typedef unsigned int    mysql_pfs_key_t;
+#endif /* HAVE_PSI_INTERFACE */
+
+/* Outside Windows the thread helpers below are plain macros over the
+pthread/sched calls; on Windows they are functions defined elsewhere. */
+#ifndef _WIN32
+#define os_thread_eq(a,b) pthread_equal(a, b)
+#define os_thread_yield() sched_yield()
+#define os_thread_get_curr_id() pthread_self()
+#else
+bool os_thread_eq(os_thread_id_t a, os_thread_id_t b);
+void os_thread_yield();
+os_thread_id_t os_thread_get_curr_id();
+#endif
+
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given.
+NOTE: We count the number of threads in os_thread_exit(). A created
+thread should always use that to exit so that the thread count will be
+decremented.
+We do not return an error code because if there is one, we crash here.
+The argument passed to the start function defaults to nullptr. */
+os_thread_t os_thread_create(os_thread_func_t func, void *arg= nullptr);
+
+/** Detach and terminate the current thread. Does not return. */
+ATTRIBUTE_NORETURN void os_thread_exit();
+
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+void
+os_thread_sleep(
+/*============*/
+	ulint	tm);	/*!< in: time in microseconds */
+
+#endif
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
new file mode 100644
index 00000000..c0f3bf68
--- /dev/null
+++ b/storage/innobase/include/page0cur.h
@@ -0,0 +1,350 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "page0page.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets the page_zip_des_t descriptor of the buffer block where the cursor
+is positioned, as obtained with buf_block_get_page_zip().
+@return page_zip_des_t pointer for the cursor's block */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+#else /* UNIV_DEBUG */
+/* Without UNIV_DEBUG the accessors are plain macros over the cursor
+fields. */
+# define page_cur_get_page(cur)		page_align((cur)->rec)
+# define page_cur_get_block(cur)	(cur)->block
+# define page_cur_get_page_zip(cur)	buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur)		(cur)->rec
+#endif /* UNIV_DEBUG */
+/* Defined in both debug and non-debug builds. */
+# define is_page_cur_get_page_zip(cur)	is_buf_block_get_page_zip((cur)->block)
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur);	/*!< out: cursor; its block and
+					rec fields are set */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur);	/*!< out: cursor; its block and
+					rec fields are set */
+/*********************************************************//**
+Returns TRUE if the cursor is before the first user record on the page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+	const page_cur_t*	cur);	/*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after the last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+	const page_cur_t*	cur);	/*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record.
*/
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record on a page */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	page_cur_t*		cur);	/*!< out: page cursor */
+/**********************************************************//**
+Moves the cursor to the next record on the page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+	page_cur_t*	cur);	/*!< in/out: cursor; must not be after last */
+/**********************************************************//**
+Moves the cursor to the previous record on the page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+	page_cur_t*	cur);	/*!< in/out: cursor; must not be before
+				first */
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	rec_offs**	offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns a pointer to the inserted record on success, i.e., if enough
+space is available, NULL otherwise. The cursor stays at the same position.
+@return pointer to record on success, NULL otherwise */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+	const page_cur_t*cur,	/*!< in: page cursor */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const rec_t*	rec,	/*!< in: record to insert after cur */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed and uncompressed
+page. Returns pointer to inserted record if succeed, i.e.,
+enough space available, NULL otherwise.
+The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if succeed, NULL otherwise */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+	page_cur_t*		cursor,	/*!< in/out: a page cursor */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const rec_offs*		offsets,/*!< in: rec_get_offsets(
+					cursor->rec, index) */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block      B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev       byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr    encoded fixed-size header bits
+@param hdr_c      number of common record header bytes with prev
+@param data_c     number of common data bytes with prev
+@param data       literal header and data bytes
+@param data_len   length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+                                 ulint prev, ulint enc_hdr,
+                                 size_t hdr_c, size_t data_c,
+                                 const void *data, size_t data_len);
+
+/** Apply a INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift unless !reuse: number of bytes the PAGE_FREE is moving
+@param enc_hdr_l number of copied record header bytes, plus record type bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconcistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+                               ulint prev, ulint shift, ulint enc_hdr_l,
+                               size_t hdr_c, size_t data_c,
+                               const void *data, size_t data_len);
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block    B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev     byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev);
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block      B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev       byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size  data payload size, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+                               size_t hdr_size, size_t data_size);
+
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple data tuple
+@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE
+@param[out] cursor page cursor
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+	const buf_block_t*	block,
+	const dict_index_t*	index,
+	const dtuple_t*		tuple,
+	page_cur_mode_t		mode,
+	page_cur_t*		cursor);
+
+/** Search for the right position for a page cursor.
+Overload without an explicit search mode (NOTE(review): the mode that is
+used is decided by the definition, which is not declared here; confirm).
+@param[in]	block	buffer block
+@param[in]	index	index tree
+@param[in]	tuple	data tuple
+@param[out]	cursor	page cursor
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+	const buf_block_t*	block,
+	const dict_index_t*	index,
+	const dtuple_t*		tuple,
+	page_cur_t*		cursor);
+
+/****************************************************************//**
+Searches for the right position for a page cursor. */
+void
+page_cur_search_with_match(
+/*=======================*/
+	const buf_block_t*	block,	/*!< in: buffer block */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	page_cur_mode_t		mode,	/*!< in: PAGE_CUR_L,
+					PAGE_CUR_LE, PAGE_CUR_G, or
+					PAGE_CUR_GE */
+	ulint*			iup_matched_fields,
+					/*!< in/out: already matched
+					fields in upper limit record */
+	ulint*			ilow_matched_fields,
+					/*!< in/out: already matched
+					fields in lower limit record */
+	page_cur_t*		cursor,	/*!< out: page cursor */
+	rtr_info_t*		rtr_info);/*!< in/out: rtree search stack */
+#ifdef BTR_CUR_HASH_ADAPT
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple key to be searched for
+@param[in] mode search mode
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] iup_matched_bytes already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[in,out] ilow_matched_bytes already matched bytes in the
+first partially matched field in the lower limit record
+@param[out] cursor page cursor */
+void
+page_cur_search_with_match_bytes(
+	const buf_block_t*	block,
+	const dict_index_t*	index,
+	const dtuple_t*		tuple,
+	page_cur_mode_t		mode,
+	ulint*			iup_matched_fields,
+	ulint*			iup_matched_bytes,
+	ulint*			ilow_matched_fields,
+	ulint*			ilow_matched_bytes,
+	page_cur_t*		cursor);
+#endif /* BTR_CUR_HASH_ADAPT */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor);/*!< out: page cursor */
+
+/** Index page cursor */
+
+struct page_cur_t{
+	const dict_index_t*	index;	/*!< index descriptor
+					(NOTE(review): presumably the index
+					the page belongs to; confirm who
+					sets it) */
+	rec_t*		rec;	/*!< pointer to a record on page */
+	rec_offs*	offsets;	/*!< record offsets
+					(NOTE(review): presumably
+					rec_get_offsets() for rec; confirm
+					who sets it) */
+	buf_block_t*	block;	/*!< pointer to the block containing rec */
+};
+
+#include "page0cur.ic"
+
+#endif
diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic
new file mode 100644
index 00000000..828be684
--- /dev/null
+++ b/storage/innobase/include/page0cur.ic
@@ -0,0 +1,291 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/page0cur.ic +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef UNIV_DEBUG +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + + if (cur->rec) { + ut_ad(page_align(cur->rec) == cur->block->frame); + } + + return(page_align(cur->rec)); +} + +/*********************************************************//** +Gets pointer to the buffer block where the cursor is positioned. +@return buffer block */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + + if (cur->rec) { + ut_ad(page_align(cur->rec) == cur->block->frame); + } + + return(cur->block); +} + +/*********************************************************//** +Gets pointer to the compressed page descriptor of the block where the cursor is positioned. 
+@return compressed page descriptor */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + return(buf_block_get_page_zip(page_cur_get_block(cur))); +} + +/*********************************************************//** +Gets the record where the cursor is positioned. +@return record */ +UNIV_INLINE +rec_t* +page_cur_get_rec( +/*=============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + + if (cur->rec) { + ut_ad(page_align(cur->rec) == cur->block->frame); + } + + return(cur->rec); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur) /*!< out: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block)); +} + +/*********************************************************//** +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur) /*!< out: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block)); +} + +/*********************************************************//** +Returns TRUE if the cursor is before first user record on page. +@return TRUE if at start */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + const page_cur_t* cur) /*!< in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_infimum(cur->rec)); +} + +/*********************************************************//** +Returns TRUE if the cursor is after last user record. 
+@return TRUE if at end */ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + const page_cur_t* cur) /*!< in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_supremum(cur->rec)); +} + +/**********************************************************//** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /*!< in: record on a page */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + page_cur_t* cur) /*!< out: page cursor */ +{ + ut_ad(rec && block && cur); + ut_ad(page_align(rec) == block->frame); + + cur->rec = (rec_t*) rec; + cur->block = (buf_block_t*) block; +} + +/**********************************************************//** +Moves the cursor to the next record on page. */ +UNIV_INLINE +void +page_cur_move_to_next( +/*==================*/ + page_cur_t* cur) /*!< in/out: cursor; must not be after last */ +{ + ut_ad(!page_cur_is_after_last(cur)); + + cur->rec = page_rec_get_next(cur->rec); +} + +/**********************************************************//** +Moves the cursor to the previous record on page. */ +UNIV_INLINE +void +page_cur_move_to_prev( +/*==================*/ + page_cur_t* cur) /*!< in/out: page cursor, not before first */ +{ + ut_ad(!page_cur_is_before_first(cur)); + + cur->rec = page_rec_get_prev(cur->rec); +} + +/** Search the right position for a page cursor. 
+@param[in] block buffer block +@param[in] index index tree +@param[in] tuple data tuple +@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE +@param[out] cursor page cursor +@return number of matched fields on the left */ +UNIV_INLINE +ulint +page_cur_search( + const buf_block_t* block, + const dict_index_t* index, + const dtuple_t* tuple, + page_cur_mode_t mode, + page_cur_t* cursor) +{ + ulint low_match = 0; + ulint up_match = 0; + + ut_ad(dtuple_check_typed(tuple)); + + page_cur_search_with_match(block, index, tuple, mode, + &up_match, &low_match, cursor, NULL); + return(low_match); +} + +/** Search the right position for a page cursor. +@param[in] block buffer block +@param[in] index index tree +@param[in] tuple data tuple +@param[out] cursor page cursor +@return number of matched fields on the left */ +UNIV_INLINE +ulint +page_cur_search( + const buf_block_t* block, + const dict_index_t* index, + const dtuple_t* tuple, + page_cur_t* cursor) +{ + return(page_cur_search(block, index, tuple, PAGE_CUR_LE, cursor)); +} + +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
+ +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dtuple_t* tuple, /*!< in: pointer to a data tuple */ + dict_index_t* index, /*!< in: record descriptor */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + rec_t* rec; + ulint size = rec_get_converted_size(index, tuple, n_ext); + + if (!*heap) { + *heap = mem_heap_create(size + + (4 + REC_OFFS_HEADER_SIZE + + dtuple_get_n_fields(tuple)) + * sizeof **offsets); + } + + rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size), + index, tuple, n_ext); + + *offsets = rec_get_offsets(rec, index, *offsets, + page_is_leaf(cursor->block->frame) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, heap); + ut_ad(size == rec_offs_size(*offsets)); + + if (is_buf_block_get_page_zip(cursor->block)) { + rec = page_cur_insert_rec_zip( + cursor, index, rec, *offsets, mtr); + } else { + rec = page_cur_insert_rec_low(cursor, + index, rec, *offsets, mtr); + } + + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets)); + return(rec); +} diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h new file mode 100644 index 00000000..a73b9e48 --- /dev/null +++ b/storage/innobase/include/page0page.h @@ -0,0 +1,1171 @@ +/***************************************************************************** +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.h +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0page_h +#define page0page_h + +#include "page0types.h" +#include "fsp0fsp.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "rem0rec.h" +#include "mach0data.h" +#ifndef UNIV_INNOCHECKSUM +#include "dict0dict.h" +#include "data0data.h" +#include "mtr0mtr.h" + +/* PAGE HEADER + =========== + +Index page header starts at the first offset left free by the FIL-module */ + +typedef byte page_header_t; +#endif /* !UNIV_INNOCHECKSUM */ + +#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this + offset */ +/*-----------------------------*/ +#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ +#define PAGE_HEAP_TOP 2 /* pointer to record heap top */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ +#define PAGE_FREE 6 /* pointer to start of page free record list */ +#define PAGE_GARBAGE 8 /* number of bytes in deleted records */ +#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or + 0 if this info has been reset by a delete, + for example */ + +/** This 10-bit field is usually 0. 
In B-tree index pages of +ROW_FORMAT=REDUNDANT tables, this byte can contain garbage if the .ibd +file was created in MySQL 4.1.0 or if the table resides in the system +tablespace and was created before MySQL 4.1.1 or MySQL 4.0.14. +In this case, the FIL_PAGE_TYPE would be FIL_PAGE_INDEX. + +In ROW_FORMAT=COMPRESSED tables, this field is always 0, because +instant ADD COLUMN is not supported. + +In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables, this field is +always 0, except in the root page of the clustered index after instant +ADD COLUMN. + +Instant ADD COLUMN will change FIL_PAGE_TYPE to FIL_PAGE_TYPE_INSTANT +and initialize the PAGE_INSTANT field to the original number of +fields in the clustered index (dict_index_t::n_core_fields). The most +significant bits are in the first byte, and the least significant 5 +bits are stored in the most significant 5 bits of PAGE_DIRECTION_B. + +These FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be assigned even if +instant ADD COLUMN was not committed. Changes to these page header fields +are not undo-logged, but changes to the hidden metadata record are. +If the server is killed and restarted, the page header fields could +remain set even though no metadata record is present. + +When the table becomes empty, the PAGE_INSTANT field and the +FIL_PAGE_TYPE can be reset and any metadata record be removed. */ +#define PAGE_INSTANT 12 + +/** last insert direction: PAGE_LEFT, .... +In ROW_FORMAT=REDUNDANT tables created before MySQL 4.1.1 or MySQL 4.0.14, +this byte can be garbage. */ +#define PAGE_DIRECTION_B 13 +#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same + direction */ +#define PAGE_N_RECS 16 /* number of user records on the page */ +/** The largest DB_TRX_ID that may have modified a record on the page; +Defined only in secondary index leaf pages and in change buffer leaf pages. +Otherwise written as 0. 
@see PAGE_ROOT_AUTO_INC */ +#define PAGE_MAX_TRX_ID 18 +/** The AUTO_INCREMENT value (on persistent clustered index root pages). */ +#define PAGE_ROOT_AUTO_INC PAGE_MAX_TRX_ID +#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page + header which are set in a page create */ +/*----*/ +#define PAGE_LEVEL 26 /* level of the node in an index tree; the + leaf level is the level 0. This field should + not be written to after page creation. */ +#define PAGE_INDEX_ID 28 /* index id where the page belongs. + This field should not be written to after + page creation. */ + +#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in + a B-tree: defined only on the root page of a + B-tree, but not in the root of an ibuf tree */ +#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF +#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF + /* in the place of PAGE_BTR_SEG_LEAF and _TOP + there is a free list base node if the page is + the root page of an ibuf tree, and at the same + place is the free list node if the page is in + a free list */ +#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE) + /* file segment header for the non-leaf pages + in a B-tree: defined only on the root page of + a B-tree, but not in the root of an ibuf + tree */ +/*----*/ +#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE) + /* start of data on the page */ + +#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES) + /* offset of the page infimum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8) + /* offset of the page supremum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9) + /* offset of the page supremum record end on + an old-style page */ +#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES) + /* offset of the page infimum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8) + /* offset 
of the page supremum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8) + /* offset of the page supremum record end on + a new-style compact page */ +/*-----------------------------*/ + +/* Heap numbers */ +#define PAGE_HEAP_NO_INFIMUM 0U /* page infimum */ +#define PAGE_HEAP_NO_SUPREMUM 1U /* page supremum */ +#define PAGE_HEAP_NO_USER_LOW 2U /* first user record in + creation (insertion) order, + not necessarily collation order; + this record may have been deleted */ + +/* Directions of cursor movement (stored in PAGE_DIRECTION field) */ +constexpr uint16_t PAGE_LEFT= 1; +constexpr uint16_t PAGE_RIGHT= 2; +constexpr uint16_t PAGE_SAME_REC= 3; +constexpr uint16_t PAGE_SAME_PAGE= 4; +constexpr uint16_t PAGE_NO_DIRECTION= 5; + +#ifndef UNIV_INNOCHECKSUM + +/* PAGE DIRECTORY + ============== +*/ + +typedef byte page_dir_slot_t; + +/* Offset of the directory start down from the page end. We call the +slot with the highest file address directory start, as it points to +the first record in the list of records. */ +#define PAGE_DIR FIL_PAGE_DATA_END + +/* We define a slot in the page directory as two bytes */ +constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2; + +/* The offset of the physically lower end of the directory, counted from +page end, when the page is empty */ +#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE) + +/* The maximum and minimum number of records owned by a directory slot. The +number may drop below the minimum in the first and the last slot in the +directory. */ +#define PAGE_DIR_SLOT_MAX_N_OWNED 8 +#define PAGE_DIR_SLOT_MIN_N_OWNED 4 + +extern my_bool srv_immediate_scrub_data_uncompressed; +#endif /* UNIV_INNOCHECKSUM */ + +/** Get the start of a page frame. 
+@param[in] ptr pointer within a page frame +@return start of the page frame */ +MY_ATTRIBUTE((const)) +inline page_t* page_align(void *ptr) +{ + return my_assume_aligned<UNIV_PAGE_SIZE_MIN> + (reinterpret_cast<page_t*>(ut_align_down(ptr, srv_page_size))); +} +inline const page_t *page_align(const void *ptr) +{ + return page_align(const_cast<void*>(ptr)); +} + +/** Gets the byte offset within a page frame. +@param[in] ptr pointer within a page frame +@return offset from the start of the page */ +MY_ATTRIBUTE((const)) +inline uint16_t page_offset(const void* ptr) +{ + return static_cast<uint16_t>(ut_align_offset(ptr, srv_page_size)); +} + +/** Determine whether an index page is not in ROW_FORMAT=REDUNDANT. +@param[in] page index page +@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED +@retval 0 if ROW_FORMAT=REDUNDANT */ +inline +byte +page_is_comp(const page_t* page) +{ + ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN)); + return(page[PAGE_HEADER + PAGE_N_HEAP] & 0x80); +} + +/** Determine whether an index page is empty. +@param[in] page index page +@return whether the page is empty (PAGE_N_RECS = 0) */ +inline +bool +page_is_empty(const page_t* page) +{ + ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN)); + return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_N_RECS + + page); +} + +/** Determine whether an index page contains garbage. +@param[in] page index page +@return whether the page contains garbage (PAGE_GARBAGE is not 0) */ +inline +bool +page_has_garbage(const page_t* page) +{ + ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN)); + return *reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_GARBAGE + + page); +} + +/** Determine whether an B-tree or R-tree index page is a leaf page. 
+@param[in] page index page +@return true if the page is a leaf (PAGE_LEVEL = 0) */ +inline +bool +page_is_leaf(const page_t* page) +{ + ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN)); + return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_LEVEL + + page); +} + +#ifndef UNIV_INNOCHECKSUM +/** Determine whether an index page record is not in ROW_FORMAT=REDUNDANT. +@param[in] rec record in an index page frame (not a copy) +@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED +@retval 0 if ROW_FORMAT=REDUNDANT */ +inline +byte +page_rec_is_comp(const byte* rec) +{ + return(page_is_comp(page_align(rec))); +} + +# ifdef UNIV_DEBUG +/** Determine if the record is the metadata pseudo-record +in the clustered index. +@param[in] rec leaf page record on an index page +@return whether the record is the metadata pseudo-record */ +inline bool page_rec_is_metadata(const rec_t* rec) +{ + return rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG; +} +# endif /* UNIV_DEBUG */ + +/** Determine the offset of the infimum record on the page. +@param[in] page index page +@return offset of the infimum record in record list, relative from page */ +inline +unsigned +page_get_infimum_offset(const page_t* page) +{ + ut_ad(!page_offset(page)); + return page_is_comp(page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM; +} + +/** Determine the offset of the supremum record on the page. +@param[in] page index page +@return offset of the supremum record in record list, relative from page */ +inline +unsigned +page_get_supremum_offset(const page_t* page) +{ + ut_ad(!page_offset(page)); + return page_is_comp(page) ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM; +} + +/** Determine whether an index page record is a user record. 
+@param[in] offset record offset in the page +@retval true if a user record +@retval false if the infimum or supremum pseudo-record */ +inline +bool +page_rec_is_user_rec_low(ulint offset) +{ + compile_time_assert(PAGE_OLD_INFIMUM >= PAGE_NEW_INFIMUM); + compile_time_assert(PAGE_OLD_SUPREMUM >= PAGE_NEW_SUPREMUM); + compile_time_assert(PAGE_NEW_INFIMUM < PAGE_OLD_SUPREMUM); + compile_time_assert(PAGE_OLD_INFIMUM < PAGE_NEW_SUPREMUM); + compile_time_assert(PAGE_NEW_SUPREMUM < PAGE_OLD_SUPREMUM_END); + compile_time_assert(PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM_END); + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START); + + return(offset != PAGE_NEW_SUPREMUM + && offset != PAGE_NEW_INFIMUM + && offset != PAGE_OLD_INFIMUM + && offset != PAGE_OLD_SUPREMUM); +} + +/** Determine if a record is the supremum record on an index page. +@param[in] offset record offset in an index page +@return true if the supremum record */ +inline +bool +page_rec_is_supremum_low(ulint offset) +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START); + return(offset == PAGE_NEW_SUPREMUM || offset == PAGE_OLD_SUPREMUM); +} + +/** Determine if a record is the infimum record on an index page. +@param[in] offset record offset in an index page +@return true if the infimum record */ +inline +bool +page_rec_is_infimum_low(ulint offset) +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START); + return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM); +} + +/** Determine whether an B-tree or R-tree index record is in a leaf page. 
+@param[in] rec index record in an index page +@return true if the record is in a leaf page */ +inline +bool +page_rec_is_leaf(const page_t* rec) +{ + const page_t* page = page_align(rec); + ut_ad(ulint(rec - page) >= page_get_infimum_offset(page)); + bool leaf = page_is_leaf(page); + ut_ad(!page_rec_is_comp(rec) + || !page_rec_is_user_rec_low(ulint(rec - page)) + || leaf == !rec_get_node_ptr_flag(rec)); + return leaf; +} + +/** Determine whether an index page record is a user record. +@param[in] rec record in an index page +@return true if a user record */ +inline +bool +page_rec_is_user_rec(const rec_t* rec); + +/** Determine whether an index page record is the supremum record. +@param[in] rec record in an index page +@return true if the supremum record */ +inline +bool +page_rec_is_supremum(const rec_t* rec); + +/** Determine whether an index page record is the infimum record. +@param[in] rec record in an index page +@return true if the infimum record */ +inline +bool +page_rec_is_infimum(const rec_t* rec); + +/** Read PAGE_MAX_TRX_ID. +@param[in] page index page +@return the value of PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +inline trx_id_t page_get_max_trx_id(const page_t *page) +{ + ut_ad(fil_page_index_page_check(page)); + static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment"); + const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_MAX_TRX_ID); + return mach_read_from_8(p); +} + +/** +Set the number of owned records. 
+@tparam compressed whether to update any ROW_FORMAT=COMPRESSED page as well +@param[in,out] block index page +@param[in,out] rec record in block.frame +@param[in] n_owned number of records skipped in the sparse page directory +@param[in] comp whether ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED +@param[in,out] mtr mini-transaction */ +template<bool compressed> +inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned, + bool comp, mtr_t *mtr) +{ + ut_ad(block->frame == page_align(rec)); + ut_ad(comp == (page_is_comp(block->frame) != 0)); + + if (page_zip_des_t *page_zip= compressed + ? buf_block_get_page_zip(block) : nullptr) + { + ut_ad(comp); + rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + if (rec_get_status(rec) != REC_STATUS_SUPREMUM) + page_zip_rec_set_owned(block, rec, n_owned, mtr); + } + else + { + rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED; + mtr->write<1,mtr_t::MAYBE_NOP>(*block, rec, (*rec & ~REC_N_OWNED_MASK) | + (n_owned << REC_N_OWNED_SHIFT)); + } +} + +/*************************************************************//** +Sets the max trx id field value. */ +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */ +/*************************************************************//** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in/out: mini-transaction */ + +/** Persist the AUTO_INCREMENT value on a clustered index root page. 
+@param[in,out] block clustered index root page +@param[in] autoinc next available AUTO_INCREMENT value +@param[in,out] mtr mini-transaction +@param[in] reset whether to reset the AUTO_INCREMENT + to a possibly smaller value than currently + exists in the page */ +void +page_set_autoinc( + buf_block_t* block, + ib_uint64_t autoinc, + mtr_t* mtr, + bool reset) + MY_ATTRIBUTE((nonnull)); + +/*************************************************************//** +Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM). +@return SPLIT SEQUENCE NUMBER */ +UNIV_INLINE +node_seq_t +page_get_ssn_id( +/*============*/ + const page_t* page); /*!< in: page */ +/*************************************************************//** +Sets the RTREE SPLIT SEQUENCE NUMBER field value */ +UNIV_INLINE +void +page_set_ssn_id( +/*============*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + node_seq_t ssn_id, /*!< in: split sequence id */ + mtr_t* mtr); /*!< in/out: mini-transaction */ + +#endif /* !UNIV_INNOCHECKSUM */ +/** Read a page header field. */ +inline uint16_t page_header_get_field(const page_t *page, ulint field) +{ + ut_ad(field <= PAGE_INDEX_ID); + ut_ad(!(field & 1)); + return mach_read_from_2(my_assume_aligned<2>(PAGE_HEADER + field + page)); +} + +#ifndef UNIV_INNOCHECKSUM +/*************************************************************//** +Returns the offset stored in the given header field. +@return offset from the start of the page, or 0 */ +UNIV_INLINE +uint16_t +page_header_get_offs( +/*=================*/ + const page_t* page, /*!< in: page */ + ulint field) /*!< in: PAGE_FREE, ... */ + MY_ATTRIBUTE((warn_unused_result)); + +/*************************************************************//** +Returns the pointer stored in the given header field, or NULL. */ +#define page_header_get_ptr(page, field) \ + (page_header_get_offs(page, field) \ + ? 
page + page_header_get_offs(page, field) : NULL) + +/** +Reset PAGE_LAST_INSERT. +@param[in,out] block file page +@param[in,out] mtr mini-transaction */ +inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page)) +#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page)) + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Gets the page number. +@return page number */ +UNIV_INLINE +uint32_t +page_get_page_no( +/*=============*/ + const page_t* page); /*!< in: page */ + +/*************************************************************//** +Gets the tablespace identifier. 
+@return space id */ +UNIV_INLINE +uint32_t +page_get_space_id( +/*==============*/ + const page_t* page); /*!< in: page */ + +/*************************************************************//** +Gets the number of user records on page (the infimum and supremum records +are not user records). +@return number of user records */ +UNIV_INLINE +uint16_t +page_get_n_recs( +/*============*/ + const page_t* page); /*!< in: index page */ + +/***************************************************************//** +Returns the number of records before the given record in chain. +The number includes infimum and supremum records. +This is the inverse function of page_rec_get_nth(). +@return number of records */ +ulint +page_rec_get_n_recs_before( +/*=======================*/ + const rec_t* rec); /*!< in: the physical record */ +/*************************************************************//** +Gets the number of records in the heap. +@return number of records in the heap */ +UNIV_INLINE +uint16_t +page_dir_get_n_heap( +/*================*/ + const page_t* page); /*!< in: index page */ +/*************************************************************//** +Gets the number of dir slots in directory. +@return number of slots */ +UNIV_INLINE +uint16_t +page_dir_get_n_slots( +/*=================*/ + const page_t* page); /*!< in: index page */ +/** Gets the pointer to a directory slot. 
+@param page index page +@param n sparse directory slot number +@return pointer to the sparse directory slot */ +inline page_dir_slot_t *page_dir_get_nth_slot(page_t *page, ulint n) +{ + ut_ad(page_dir_get_n_slots(page) > n); + static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility"); + return my_assume_aligned<2>(page + srv_page_size - (PAGE_DIR + 2) - n * 2); +} +inline const page_dir_slot_t *page_dir_get_nth_slot(const page_t *page,ulint n) +{ + return page_dir_get_nth_slot(const_cast<page_t*>(page), n); +} +/**************************************************************//** +Used to check the consistency of a record on a page. 
+@return TRUE if succeed */ +UNIV_INLINE +ibool +page_rec_check( +/*===========*/ + const rec_t* rec); /*!< in: record */ +/** Get the record pointed to by a directory slot. +@param[in] slot directory slot +@return pointer to record */ +inline rec_t *page_dir_slot_get_rec(page_dir_slot_t *slot) +{ + return page_align(slot) + mach_read_from_2(my_assume_aligned<2>(slot)); +} +inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot) +{ + return page_dir_slot_get_rec(const_cast<rec_t*>(slot)); +} +/***************************************************************//** +Gets the number of records owned by a directory slot. +@return number of records */ +UNIV_INLINE +ulint +page_dir_slot_get_n_owned( +/*======================*/ + const page_dir_slot_t* slot); /*!< in: page directory slot */ +/************************************************************//** +Calculates the space reserved for directory slots of a given +number of records. The exact value is a fraction number +n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is +rounded upwards to an integer. */ +UNIV_INLINE +ulint +page_dir_calc_reserved_space( +/*=========================*/ + ulint n_recs); /*!< in: number of records */ +/***************************************************************//** +Looks for the directory slot which owns the given record. +@return the directory slot number */ +ulint +page_dir_find_owner_slot( +/*=====================*/ + const rec_t* rec); /*!< in: the physical record */ + +/***************************************************************//** +Returns the heap number of a record. +@return heap number */ +UNIV_INLINE +ulint +page_rec_get_heap_no( +/*=================*/ + const rec_t* rec); /*!< in: the physical record */ +/** Determine whether a page has any siblings. 
+@param[in] page page frame +@return true if the page has any siblings */ +inline bool page_has_siblings(const page_t* page) +{ + compile_time_assert(!(FIL_PAGE_PREV % 8)); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + return *reinterpret_cast<const uint64_t*>(page + FIL_PAGE_PREV) + != ~uint64_t(0); +} + +/** Determine whether a page has a predecessor. +@param[in] page page frame +@return true if the page has a predecessor */ +inline bool page_has_prev(const page_t* page) +{ + return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_PREV) + != FIL_NULL; +} + +/** Determine whether a page has a successor. +@param[in] page page frame +@return true if the page has a successor */ +inline bool page_has_next(const page_t* page) +{ + return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_NEXT) + != FIL_NULL; +} + +/** Read the AUTO_INCREMENT value from a clustered index root page. +@param[in] page clustered index root page +@return the persisted AUTO_INCREMENT value */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +inline uint64_t page_get_autoinc(const page_t *page) +{ + ut_d(uint16_t page_type= fil_page_get_type(page)); + ut_ad(page_type == FIL_PAGE_INDEX || page_type == FIL_PAGE_TYPE_INSTANT); + ut_ad(!page_has_siblings(page)); + const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_ROOT_AUTO_INC); + return mach_read_from_8(p); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + const rec_t* rec, /*!< in: pointer to record */ + ulint comp); /*!< in: nonzero=compact page layout */ +/************************************************************//** +Gets the pointer to the next record on the page. 
+@return pointer to next record */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + const rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + const rec_t* rec); /*!< in: pointer to record, must not be page + infimum */ +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + rec_t* rec); /*!< in: pointer to record, + must not be page infimum */ + +/************************************************************//** +true if the record is the first user record on a page. +@return true if the first user record */ +UNIV_INLINE +bool +page_rec_is_first( +/*==============*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ + MY_ATTRIBUTE((warn_unused_result)); + +/************************************************************//** +true if the record is the second user record on a page. 
+@return true if the second user record */ +UNIV_INLINE +bool +page_rec_is_second( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ + MY_ATTRIBUTE((warn_unused_result)); + +/************************************************************//** +true if the record is the last user record on a page. +@return true if the last user record */ +UNIV_INLINE +bool +page_rec_is_last( +/*=============*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ + MY_ATTRIBUTE((warn_unused_result)); + +/************************************************************//** +Checks whether right_rec can be reached from left_rec by following the +next-record pointer at most val times (a distance of 0 means that the two +pointers are equal). +@param[in] left_rec record to start walking from +@param[in] right_rec record to look for at or after left_rec +@param[in] val maximum number of next-record steps allowed +@return true if the distance is at most val */ +UNIV_INLINE +bool +page_rec_distance_is_at_most( +/*=========================*/ + const rec_t* left_rec, + const rec_t* right_rec, + ulint val) + MY_ATTRIBUTE((warn_unused_result)); + +/************************************************************//** +true if the record is the second last user record on a page. +@return true if the second last user record */ +UNIV_INLINE +bool +page_rec_is_second_last( +/*====================*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ + MY_ATTRIBUTE((warn_unused_result)); + +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap. 
+@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap if page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/*************************************************************//** +Calculates free space if a page is emptied. +@return free space */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((const)); +/************************************************************//** +Returns the sum of the sizes of the records in the record list +excluding the infimum and supremum records. +@return data in bytes */ +UNIV_INLINE +uint16_t +page_get_data_size( +/*===============*/ + const page_t* page); /*!< in: index page */ +/** Read the PAGE_DIRECTION field from a byte. +@param[in] ptr pointer to PAGE_DIRECTION_B +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_ptr_get_direction(const byte* ptr); + +/** Read the PAGE_DIRECTION field. +@param[in] page index page +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_get_direction(const page_t* page) +{ + return page_ptr_get_direction(PAGE_HEADER + PAGE_DIRECTION_B + page); +} + +/** Read the PAGE_INSTANT field. +@param[in] page index page +@return the value of the PAGE_INSTANT field */ +inline +uint16_t +page_get_instant(const page_t* page); + +/** Create an uncompressed index page. 
+@param[in,out] block buffer block +@param[in,out] mtr mini-transaction +@param[in] comp set unless ROW_FORMAT=REDUNDANT */ +void page_create(buf_block_t *block, mtr_t *mtr, bool comp); +/**********************************************************//** +Create a compressed B-tree index page. */ +void +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame + where the page is created */ + dict_index_t* index, /*!< in: the index of the + page */ + ulint level, /*!< in: the B-tree level of + the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr); /*!< in/out: mini-transaction + handle */ +/**********************************************************//** +Empty a previously created B-tree index page. */ +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(1,2))); +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr */ +/*************************************************************//** +Copies records from page to new_page, from the given record onward, +including that record. Infimum and supremum records are not copied. 
+The records are copied to the start of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original successor of the infimum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Copies records from page to new_page, up to the given record, NOT +including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. 
*/ +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. */ +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure (new_block will +be decompressed) */ +ibool +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull(1, 2, 4, 5))); +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. 
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure */ +ibool +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull(1, 2, 4, 5))); +/** Create an index page. +@param[in,out] block buffer block +@param[in] comp nonzero=compact page format */ +void page_create_low(const buf_block_t* block, bool comp); + +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets);/*!< in: record descriptor */ +# ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. */ +void +page_print_list( +/*============*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +Prints the info in a page header. 
*/ +void +page_header_print( +/*==============*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +This is used to print the contents of the page for +debugging purposes. */ +void +page_print( +/*=======*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint dn, /*!< in: print dn first and last entries + in directory */ + ulint rn); /*!< in: print rn first and last records + in directory */ +# endif /* UNIV_BTR_PRINT */ +/***************************************************************//** +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. +@return TRUE if ok */ +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +#ifdef UNIV_DEBUG +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +void +page_check_dir( +/*===========*/ + const page_t* page); /*!< in: index page */ +#endif /* UNIV_DEBUG */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. 
This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +/** Check the consistency of an index page. +@param[in] page index page +@param[in] index B-tree or R-tree index +@return whether the page is valid */ +bool page_validate(const page_t* page, const dict_index_t* index) + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Looks in the page record list for a record with the given heap number. +@return record, NULL if not found */ +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no);/*!< in: heap number */ +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ +const rec_t* +page_find_rec_max_not_deleted( + const page_t* page); + +#endif /* !UNIV_INNOCHECKSUM */ + +#include "page0page.ic" + +#endif diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic new file mode 100644 index 00000000..6514886d --- /dev/null +++ b/storage/innobase/include/page0page.ic @@ -0,0 +1,724 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.ic +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0page_ic +#define page0page_ic + +#ifndef UNIV_INNOCHECKSUM +#include "rem0cmp.h" +#include "mtr0log.h" +#include "page0zip.h" + +/*************************************************************//** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(block); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(trx_id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + + if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) { + + page_set_max_trx_id(block, page_zip, trx_id, mtr); + } +} + +/*************************************************************//** +Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM). 
+@return SPLIT SEQUENCE NUMBER */ +UNIV_INLINE +node_seq_t +page_get_ssn_id( +/*============*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page); + + return(static_cast<node_seq_t>( + mach_read_from_8(page + FIL_RTREE_SPLIT_SEQ_NUM))); +} + +/*************************************************************//** +Sets the RTREE SPLIT SEQUENCE NUMBER field (FIL_RTREE_SPLIT_SEQ_NUM, +8 bytes) through the mini-transaction. If mtr->write() returns true and +page_zip is not NULL, the same 8 bytes are also copied into +page_zip->data. */ +UNIV_INLINE +void +page_set_ssn_id( +/*============*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + node_seq_t ssn_id, /*!< in: split sequence number to store */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX | + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_zip || page_zip == &block->page.zip); + constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM; + byte *b= my_assume_aligned<2>(&block->frame[field]); + if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) && /* NOTE(review): presumably false when the write was a no-op; confirm against mtr_t::write() */ + UNIV_LIKELY_NULL(page_zip)) + memcpy_aligned<2>(&page_zip->data[field], b, 8); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +#ifndef UNIV_INNOCHECKSUM +/*************************************************************//** +Returns the offset stored in the given header field. Debug builds assert +that field is one of the three offset-valued fields and that +PAGE_HEAP_TOP is never 0. +@return offset from the start of the page, or 0 */ +UNIV_INLINE +uint16_t +page_header_get_offs( +/*=================*/ + const page_t* page, /*!< in: page */ + ulint field) /*!< in: PAGE_FREE, PAGE_LAST_INSERT or PAGE_HEAP_TOP */ +{ + ut_ad((field == PAGE_FREE) + || (field == PAGE_LAST_INSERT) + || (field == PAGE_HEAP_TOP)); + + uint16_t offs = page_header_get_field(page, field); + + ut_ad((field != PAGE_HEAP_TOP) || offs); + + return(offs); +} + + +/** +Reset PAGE_LAST_INSERT. 
+@param[in,out] block file page +@param[in,out] mtr mini-transaction */ +inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr) +{ + constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT; + byte *b= my_assume_aligned<2>(&block->frame[field]); + if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memset_aligned<2>(&block->page.zip.data[field], 0, 2); +} + +/***************************************************************//** +Returns the heap number of a record. +@return heap number */ +UNIV_INLINE +ulint +page_rec_get_heap_no( +/*=================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + if (page_rec_is_comp(rec)) { + return(rec_get_heap_no_new(rec)); + } else { + return(rec_get_heap_no_old(rec)); + } +} + +/** Determine whether an index page record is a user record. +@param[in] rec record in an index page +@return true if a user record */ +inline +bool +page_rec_is_user_rec(const rec_t* rec) +{ + ut_ad(page_rec_check(rec)); + return(page_rec_is_user_rec_low(page_offset(rec))); +} + +/** Determine whether an index page record is the supremum record. +@param[in] rec record in an index page +@return true if the supremum record */ +inline +bool +page_rec_is_supremum(const rec_t* rec) +{ + ut_ad(page_rec_check(rec)); + return(page_rec_is_supremum_low(page_offset(rec))); +} + +/** Determine whether an index page record is the infimum record. +@param[in] rec record in an index page +@return true if the infimum record */ +inline +bool +page_rec_is_infimum(const rec_t* rec) +{ + ut_ad(page_rec_check(rec)); + return(page_rec_is_infimum_low(page_offset(rec))); +} + +/************************************************************//** +true if the record is the first user record on a page. 
+@return true if the first user record */ +UNIV_INLINE +bool +page_rec_is_first( +/*==============*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ +{ + ut_ad(page_get_n_recs(page) > 0); + + return(page_rec_get_next_const(page_get_infimum_rec(page)) == rec); +} + +/************************************************************//** +true if the record is the second user record on a page. +@return true if the second user record */ +UNIV_INLINE +bool +page_rec_is_second( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ +{ + ut_ad(page_get_n_recs(page) > 1); + + return(page_rec_get_next_const( + page_rec_get_next_const(page_get_infimum_rec(page))) == rec); +} + +/************************************************************//** +true if the record is the last user record on a page. +@return true if the last user record */ +UNIV_INLINE +bool +page_rec_is_last( +/*=============*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ +{ + ut_ad(page_get_n_recs(page) > 0); + + return(page_rec_get_next_const(rec) == page_get_supremum_rec(page)); +} + +/************************************************************//** +true if distance between the records (measured in number of times we have to +move to the next record) is at most the specified value */ +UNIV_INLINE +bool +page_rec_distance_is_at_most( +/*=========================*/ + const rec_t* left_rec, + const rec_t* right_rec, + ulint val) +{ + for (ulint i = 0; i <= val; i++) { + if (left_rec == right_rec) { + return (true); + } + left_rec = page_rec_get_next_const(left_rec); + } + return (false); +} + +/************************************************************//** +true if the record is the second last user record on a page. 
+@return true if the second last user record */ +UNIV_INLINE +bool +page_rec_is_second_last( +/*====================*/ + const rec_t* rec, /*!< in: record */ + const page_t* page) /*!< in: page */ +{ + ut_ad(page_get_n_recs(page) > 1); + ut_ad(!page_rec_is_last(rec, page)); + + return(page_rec_get_next_const( + page_rec_get_next_const(rec)) == page_get_supremum_rec(page)); +} + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + return((rec_t*) page_rec_get_nth_const(page, nth)); +} + +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ +{ + ulint middle = (ulint(page_get_n_recs(page)) + + PAGE_HEAP_NO_USER_LOW) / 2; + + return(page_rec_get_nth(page, middle)); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Gets the page number. +@return page number */ +UNIV_INLINE +uint32_t +page_get_page_no( +/*=============*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page == page_align((page_t*) page)); + return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_OFFSET)); +} + +#ifndef UNIV_INNOCHECKSUM +/*************************************************************//** +Gets the tablespace identifier. 
+@return space id */ +UNIV_INLINE +uint32_t +page_get_space_id( +/*==============*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page == page_align((page_t*) page)); /* the caller must pass the start of the page frame */ + return mach_read_from_4(my_assume_aligned<2> + (page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Gets the number of user records on page (infimum and supremum records +are not user records). +@return number of user records */ +UNIV_INLINE +uint16_t +page_get_n_recs( +/*============*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_RECS)); +} + +#ifndef UNIV_INNOCHECKSUM +/*************************************************************//** +Gets the number of dir slots in directory. +@return number of slots */ +UNIV_INLINE +uint16_t +page_dir_get_n_slots( +/*=================*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_DIR_SLOTS)); +} + +/*************************************************************//** +Gets the number of records in the heap, read from PAGE_N_HEAP with the +top bit masked off. This is not the user record count: +page_get_max_insert_size() subtracts 2 from it for the infimum and +supremum records. +@return number of records in the heap */ +UNIV_INLINE +uint16_t +page_dir_get_n_heap( +/*================*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff); +} + +/**************************************************************//** +Used to check the consistency of a record on a page: the pointer must be +non-NULL and its page offset must lie between PAGE_DATA and PAGE_HEAP_TOP +(both inclusive). A violated check trips ut_a() rather than returning. +@return TRUE (always, when the function returns) */ +UNIV_INLINE +ibool +page_rec_check( +/*===========*/ + const rec_t* rec) /*!< in: record */ +{ + const page_t* page = page_align(rec); + + ut_a(rec); + + ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP)); + ut_a(page_offset(rec) >= PAGE_DATA); + + return(TRUE); +} + +/***************************************************************//** +Gets the number of records owned by a directory slot. 
+@return number of records */ +UNIV_INLINE +ulint +page_dir_slot_get_n_owned( +/*======================*/ + const page_dir_slot_t* slot) /*!< in: page directory slot */ +{ + const rec_t* rec = page_dir_slot_get_rec(slot); + if (page_rec_is_comp(slot)) { /* NOTE(review): the slot pointer is passed where a record pointer is expected; presumably only the enclosing page matters here, confirm against page_rec_is_comp() */ + return(rec_get_n_owned_new(rec)); + } else { + return(rec_get_n_owned_old(rec)); + } +} + +/************************************************************//** +Calculates the space reserved for directory slots of a given number of +records. The exact value is the fraction n * PAGE_DIR_SLOT_SIZE / +PAGE_DIR_SLOT_MIN_N_OWNED, rounded upwards to an integer. */ +UNIV_INLINE +ulint +page_dir_calc_reserved_space( +/*=========================*/ + ulint n_recs) /*!< in: number of records */ +{ + return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1) + / PAGE_DIR_SLOT_MIN_N_OWNED); +} + +/************************************************************//** +Gets the pointer to the next record on the page. A next-record offset of +srv_page_size or more is reported on stderr and then treated as fatal +(ut_error). +@return pointer to next record, or NULL if the next-record offset is 0 */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + const rec_t* rec, /*!< in: pointer to record */ + ulint comp) /*!< in: nonzero=compact page layout */ +{ + ulint offs; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + offs = rec_get_next_offs(rec, comp); + + if (offs >= srv_page_size) { /* the offset cannot point inside this page */ + fprintf(stderr, + "InnoDB: Next record offset is nonsensical %lu" + " in record at offset %lu\n" + "InnoDB: rec address %p, space id %lu, page %lu\n", + (ulong) offs, (ulong) page_offset(rec), + (void*) rec, + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page)); + ut_error; + } else if (offs == 0) { /* no successor: callers must be prepared for NULL */ + + return(NULL); + } + + ut_ad(page_rec_is_infimum(rec) + || (!page_is_leaf(page) && !page_has_prev(page)) + || !(rec_get_info_bits(page + offs, comp) + & REC_INFO_MIN_REC_FLAG)); + + return(page + offs); +} + +/************************************************************//** +Gets the pointer to the next 
record on the page. +@return pointer to next record */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + rec_t* rec) /*!< in: pointer to record */ +{ + return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + return(page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + const rec_t* r; + ulint page_is_compact = page_rec_is_comp(rec); + + for (r = page_rec_get_next_const(rec); + !page_rec_is_supremum(r) + && rec_get_deleted_flag(r, page_is_compact); + r = page_rec_get_next_const(r)) { + /* noop */ + } + + return(r); +} + +/************************************************************//** +Gets the pointer to the previous record. 
+@return pointer to previous record */ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + const rec_t* rec) /*!< in: pointer to record, must not be page + infimum */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + const rec_t* rec2; + const rec_t* prev_rec = NULL; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + ut_ad(!page_rec_is_infimum(rec)); + + slot_no = page_dir_find_owner_slot(rec); + + ut_a(slot_no != 0); + + slot = page_dir_get_nth_slot(page, slot_no - 1); /* the directory slot just before the one that owns rec */ + + rec2 = page_dir_slot_get_rec(slot); /* start here and walk forward until rec is reached */ + + if (page_is_comp(page)) { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, TRUE); + } + } else { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, FALSE); + } + } + + ut_a(prev_rec); /* at least one step must have been taken */ + + return(prev_rec); +} + +/************************************************************//** +Gets the pointer to the previous record. Non-const wrapper around +page_rec_get_prev_const(). +@return pointer to previous record */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + rec_t* rec) /*!< in: pointer to record, must not be page + infimum */ +{ + return((rec_t*) page_rec_get_prev_const(rec)); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/************************************************************//** +Returns the sum of the sizes of the records in the record list, excluding +the infimum and supremum records. Computed as PAGE_HEAP_TOP minus the end +of the supremum record for the page format minus PAGE_GARBAGE. +@return data in bytes */ +UNIV_INLINE +uint16_t +page_get_data_size( +/*===============*/ + const page_t* page) /*!< in: index page */ +{ + unsigned ret = page_header_get_field(page, PAGE_HEAP_TOP) + - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) + - page_header_get_field(page, PAGE_GARBAGE); + ut_ad(ret < srv_page_size); + return static_cast<uint16_t>(ret); +} + +#ifndef UNIV_INNOCHECKSUM +/*************************************************************//** +Calculates free space if a page is emptied. 
+@return free space */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + ulint comp) /*!< in: nonzero=compact page layout */ +{ + if (comp) { + return((ulint)(srv_page_size + - PAGE_NEW_SUPREMUM_END + - PAGE_DIR + - 2 * PAGE_DIR_SLOT_SIZE)); + } + + return((ulint)(srv_page_size + - PAGE_OLD_SUPREMUM_END + - PAGE_DIR + - 2 * PAGE_DIR_SLOT_SIZE)); +} + +/************************************************************//** +Each user record on a page, and also the deleted user records in the heap +takes its size plus the fraction of the dir cell size / +PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the +value of page_get_free_space_of_empty, the insert is impossible, otherwise +it is allowed. This function returns the maximum combined size of records +which can be inserted on top of the record heap. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs) /*!< in: number of records */ +{ + ulint occupied; + ulint free_space; + + if (page_is_comp(page)) { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_NEW_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(TRUE); + } else { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_OLD_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(FALSE); + } + + /* Above the 'n_recs +' part reserves directory space for the new + inserted records; the '- 2' excludes page infimum and supremum + records */ + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of the record heap if a 
page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs) /*!< in: number of records */ +{ + ulint occupied; + ulint free_space; + + occupied = page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page)); + + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/** Read the PAGE_DIRECTION field from a byte. +@param[in] ptr pointer to PAGE_DIRECTION_B +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_ptr_get_direction(const byte* ptr) +{ + ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B); + return *ptr & ((1U << 3) - 1); +} + +/** Read the PAGE_INSTANT field. +@param[in] page index page +@return the value of the PAGE_INSTANT field */ +inline +uint16_t +page_get_instant(const page_t* page) +{ + uint16_t i = page_header_get_field(page, PAGE_INSTANT); +#ifdef UNIV_DEBUG + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_INSTANT: + ut_ad(page_get_direction(page) <= PAGE_NO_DIRECTION); + ut_ad(i >> 3); + break; + case FIL_PAGE_INDEX: + ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page)); + break; + case FIL_PAGE_RTREE: + ut_ad(i <= PAGE_NO_DIRECTION); + break; + default: + ut_ad("invalid page type" == 0); + break; + } +#endif /* UNIV_DEBUG */ + return static_cast<uint16_t>(i >> 3); /* i / 8 */ +} +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h new file mode 100644 index 00000000..6c5a681f --- /dev/null +++ b/storage/innobase/include/page0types.h @@ -0,0 +1,161 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0types.h +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0types_h +#define page0types_h + +#include "dict0types.h" +#include "mtr0types.h" +#include "rem0types.h" + +#include <map> + +/** Eliminates a name collision on HP-UX */ +#define page_t ib_page_t +/** Type of the index page */ +typedef byte page_t; +#ifndef UNIV_INNOCHECKSUM +/** Index page cursor */ +struct page_cur_t; +/** Buffer pool block */ +struct buf_block_t; + +/** Compressed index page */ +typedef byte page_zip_t; + +/* The following definitions would better belong to page0zip.h, +but we cannot include page0zip.h from rem0rec.ic, because +page0*.h includes rem0rec.h and may include rem0rec.ic. */ + +/** Number of bits needed for representing different compressed page sizes */ +#define PAGE_ZIP_SSIZE_BITS 3 + +/** Maximum compressed page shift size */ +#define PAGE_ZIP_SSIZE_MAX \ + (UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1) + +/* Make sure there are enough bits available to store the maximum zip +ssize, which is the number of shifts from 512. 
*/ +#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS) +# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)" +#endif + +/* Page cursor search modes; the values must be in this order! */ +enum page_cur_mode_t { + PAGE_CUR_UNSUPP = 0, + PAGE_CUR_G = 1, + PAGE_CUR_GE = 2, + PAGE_CUR_L = 3, + PAGE_CUR_LE = 4, + +/* PAGE_CUR_LE_OR_EXTENDS = 5,*/ /* This is a search mode used in + "column LIKE 'abc%' ORDER BY column DESC"; + we have to find strings which are <= 'abc' or + which extend it */ + +/* These search modes are for searching R-tree indexes. */ + PAGE_CUR_CONTAIN = 7, + PAGE_CUR_INTERSECT = 8, + PAGE_CUR_WITHIN = 9, + PAGE_CUR_DISJOINT = 10, + PAGE_CUR_MBR_EQUAL = 11, + PAGE_CUR_RTREE_INSERT = 12, + PAGE_CUR_RTREE_LOCATE = 13, + PAGE_CUR_RTREE_GET_FATHER = 14 +}; + +/** Compressed page descriptor */ +struct page_zip_des_t +{ + page_zip_t* data; /*!< compressed page data */ + +#ifdef UNIV_DEBUG + unsigned m_start:16; /*!< start offset of modification log */ + bool m_external; /*!< Allocated externally, not from the + buffer pool */ +#endif /* UNIV_DEBUG */ + unsigned m_end:16; /*!< end offset of modification log */ + unsigned m_nonempty:1; /*!< TRUE if the modification log + is not empty */ + unsigned n_blobs:12; /*!< number of externally stored + columns on the page; the maximum + is 744 on a 16 KiB page */ + unsigned ssize:PAGE_ZIP_SSIZE_BITS; + /*!< 0 or compressed page shift size; + the size in bytes is + (UNIV_ZIP_SIZE_MIN >> 1) << ssize. 
*/ +}; + +/** Compression statistics for a given page size */ +struct page_zip_stat_t { + /** Number of page compressions */ + ulint compressed; + /** Number of successful page compressions */ + ulint compressed_ok; + /** Number of page decompressions */ + ulint decompressed; + /** Duration of page compressions in microseconds */ + ib_uint64_t compressed_usec; + /** Duration of page decompressions in microseconds */ + ib_uint64_t decompressed_usec; + page_zip_stat_t() : + /* Initialize members to 0 so that when we do + stlmap[key].compressed++ and element with "key" does not + exist it gets inserted with zeroed members. */ + compressed(0), + compressed_ok(0), + decompressed(0), + compressed_usec(0), + decompressed_usec(0) + { } +}; + +/** Compression statistics types */ +typedef std::map< + index_id_t, + page_zip_stat_t, + std::less<index_id_t>, + ut_allocator<std::pair<const index_id_t, page_zip_stat_t> > > + page_zip_stat_per_index_t; + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by dict_index_t::id */ +extern page_zip_stat_per_index_t page_zip_stat_per_index; + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. 
*/ +void +page_zip_rec_set_owned( +/*===================*/ + buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag, /*!< in: the owned flag (nonzero=TRUE) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +#endif /* !UNIV_INNOCHECKSUM */ +#endif diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h new file mode 100644 index 00000000..5a70e995 --- /dev/null +++ b/storage/innobase/include/page0zip.h @@ -0,0 +1,392 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0zip.h +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifndef page0zip_h +#define page0zip_h + +#include "buf0types.h" + +#ifndef UNIV_INNOCHECKSUM +#include "mtr0types.h" +#include "page0types.h" +#include "dict0types.h" +#include "srv0srv.h" +#include "trx0types.h" +#include "mem0mem.h" + +/* Compression level to be used by zlib. Settable by user. */ +extern uint page_zip_level; + +/* Default compression level. */ +#define DEFAULT_COMPRESSION_LEVEL 6 +/** Start offset of the area that will be compressed */ +#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END +/** Size of a compressed page directory entry */ +#define PAGE_ZIP_DIR_SLOT_SIZE 2 +/** Predefine the sum of DIR_SLOT, TRX_ID & ROLL_PTR */ +#define PAGE_ZIP_CLUST_LEAF_SLOT_SIZE \ + (PAGE_ZIP_DIR_SLOT_SIZE \ + + DATA_TRX_ID_LEN \ + + DATA_ROLL_PTR_LEN) +/** Mask of record offsets */ +#define PAGE_ZIP_DIR_SLOT_MASK 0x3fffU +/** 'owned' flag */ +#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000U +/** 'deleted' flag */ +#define PAGE_ZIP_DIR_SLOT_DEL 0x8000U + +/**********************************************************************//** +Determine the size of a compressed page in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +page_zip_get_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************************//** +Set the size of a compressed page in bytes. 
*/ +UNIV_INLINE +void +page_zip_set_size( +/*==============*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint size); /*!< in: size in bytes */ + +/** Determine if a record is so big that it needs to be stored externally. +@param[in] rec_size length of the record in bytes +@param[in] comp nonzero=compact format +@param[in] n_fields number of fields in the record; ignored if +tablespace is not compressed +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return false if the entire record can be stored locally on the page */ +inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields, + ulint zip_size) + MY_ATTRIBUTE((warn_unused_result)); + +/**********************************************************************//** +Determine the guaranteed free space on an empty page. +@return minimum payload size on the page */ +ulint +page_zip_empty_size( +/*================*/ + ulint n_fields, /*!< in: number of columns in the index */ + ulint zip_size) /*!< in: compressed page size in bytes */ + MY_ATTRIBUTE((const)); + +/** Check whether a tuple is too big for compressed table +@param[in] index dict index object +@param[in] entry entry for the index +@return true if it's too big, otherwise false */ +bool +page_zip_is_too_big( + const dict_index_t* index, + const dtuple_t* entry); + +/**********************************************************************//** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip); /*!< in/out: compressed page + descriptor */ + +/**********************************************************************//** +Configure the zlib allocator to use the given memory heap. */ +void +page_zip_set_alloc( +/*===============*/ + void* stream, /*!< in/out: zlib stream */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/** Attempt to compress a ROW_FORMAT=COMPRESSED page. 
+@retval true on success +@retval false on failure; block->page.zip will be left intact. */ +bool +page_zip_compress( + buf_block_t* block, /*!< in/out: buffer block */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: compression level */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); + +/**********************************************************************//** +Write the index information for the compressed page. +@return used size of buf */ +ulint +page_zip_fields_encode( +/*===================*/ + ulint n, /*!< in: number of fields + to compress */ + const dict_index_t* index, /*!< in: index comprising + at least n fields */ + ulint trx_id_pos, + /*!< in: position of the trx_id column + in the index, or ULINT_UNDEFINED if + this is a non-leaf page */ + byte* buf); /*!< out: buffer of (n + 1) * 2 bytes */ + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. +@return TRUE on success, FALSE on failure */ +ibool +page_zip_decompress( +/*================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ + MY_ATTRIBUTE((nonnull(1,2))); + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate a compressed page descriptor. 
+@return TRUE if ok */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + const page_zip_des_t* page_zip); /*!< in: compressed page + descriptor */ +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_ZIP_DEBUG +/**********************************************************************//** +Check that the compressed and decompressed pages match. +@return TRUE if valid, FALSE if not */ +ibool +page_zip_validate_low( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index, /*!< in: index of the page, if known */ + ibool sloppy) /*!< in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ + MY_ATTRIBUTE((nonnull(1,2))); +/**********************************************************************//** +Check that the compressed and decompressed pages match. */ +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index) /*!< in: index of the page, if known */ + MY_ATTRIBUTE((nonnull(1,2))); +#endif /* UNIV_ZIP_DEBUG */ + +/**********************************************************************//** +Determine how big a record can be inserted without recompressing the page. +@return a positive number indicating the maximum size of a record +whose insertion is guaranteed to succeed, or zero or negative */ +UNIV_INLINE +lint +page_zip_max_ins_size( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust)/*!< in: TRUE if clustered index */ + MY_ATTRIBUTE((warn_unused_result)); + +/**********************************************************************//** +Determine if enough space is available in the modification log. 
+@return TRUE if page_zip_write_rec() will succeed */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust,/*!< in: TRUE if clustered index */ + ulint length, /*!< in: combined size of the record */ + ulint create) /*!< in: nonzero=add the record to + the heap */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Write an entire record to the ROW_FORMAT=COMPRESSED page. +The data must already have been written to the uncompressed page. +@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in] rec record in the uncompressed page +@param[in] index the index that the page belongs to +@param[in] offsets rec_get_offsets(rec, index) +@param[in] create nonzero=insert, zero=update +@param[in,out] mtr mini-transaction */ +void page_zip_write_rec(buf_block_t *block, const byte *rec, + const dict_index_t *index, const rec_offs *offsets, + ulint create, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); + +/**********************************************************************//** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. */ +void +page_zip_write_blob_ptr( +/*====================*/ + buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ + const byte* rec, /*!< in/out: record whose data is being + written */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint n, /*!< in: column index */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); + +/**********************************************************************//** +Write the node pointer of a record on a non-leaf compressed page. 
*/ +void +page_zip_write_node_ptr( +/*====================*/ + buf_block_t* block, /*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + ulint size, /*!< in: data size of rec */ + ulint ptr, /*!< in: node pointer */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); + +/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record. +@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in,out] rec record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx_id_col field number of DB_TRX_ID (number of PK fields) +@param[in] trx_id DB_TRX_ID value (transaction identifier) +@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer) +@param[in,out] mtr mini-transaction */ +void +page_zip_write_trx_id_and_roll_ptr( + buf_block_t* block, + byte* rec, + const rec_offs* offsets, + ulint trx_id_col, + trx_id_t trx_id, + roll_ptr_t roll_ptr, + mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); + +/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record. +@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in] flag the value of the delete-mark flag +@param[in,out] mtr mini-transaction */ +void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag, + mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); + +/**********************************************************************//** +Insert a record to the dense page directory. */ +void +page_zip_dir_insert( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + uint16_t free_rec,/*!< in: record from which rec was + allocated, or 0 */ + byte* rec, /*!< in: record to insert */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(1,3,4))); + +/** Shift the dense page directory and the array of BLOB pointers +when a record is deleted. 
+@param[in,out] block index page +@param[in,out] rec record being deleted +@param[in] index the index that the page belongs to +@param[in] offsets rec_get_offsets(rec, index) +@param[in] free previous start of the free list +@param[in,out] mtr mini-transaction */ +void page_zip_dir_delete(buf_block_t *block, byte *rec, + const dict_index_t *index, const rec_offs *offsets, + const byte *free, mtr_t *mtr) + MY_ATTRIBUTE((nonnull(1,2,3,4,6))); + +/**********************************************************************//** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, redo log will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. +@retval true on success +@retval false on failure; the block_zip will be left intact */ +bool +page_zip_reorganize( + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint z_level,/*!< in: compression level */ + mtr_t* mtr, /*!< in: mini-transaction */ + bool restore = false)/*!< whether to restore on failure */ + MY_ATTRIBUTE((nonnull)); + +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. 
*/ +void +page_zip_copy_recs( + buf_block_t* block, /*!< in/out: buffer block */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr); /*!< in: mini-transaction */ +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the compressed page checksum. +@param[in] data compressed page +@param[in] size size of compressed page +@param[in] algo algorithm to use +@return page checksum */ +uint32_t +page_zip_calc_checksum( + const void* data, + ulint size, + srv_checksum_algorithm_t algo); + +/** Validate the checksum on a ROW_FORMAT=COMPRESSED page. +@param data ROW_FORMAT=COMPRESSED page +@param size size of the page, in bytes +@return whether the stored checksum matches innodb_checksum_algorithm */ +bool page_zip_verify_checksum(const byte *data, size_t size); + +#ifndef UNIV_INNOCHECKSUM +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index(); +/*===========================*/ + +#include "page0zip.ic" +#endif /* !UNIV_INNOCHECKSUM */ + +#endif /* page0zip_h */ diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic new file mode 100644 index 00000000..ede61283 --- /dev/null +++ b/storage/innobase/include/page0zip.ic @@ -0,0 +1,334 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0zip.ic +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#include "page0page.h" + +/* The format of compressed pages is as follows. + +The header and trailer of the uncompressed pages, excluding the page +directory in the trailer, are copied as is to the header and trailer +of the compressed page. + +At the end of the compressed page, there is a dense page directory +pointing to every user record contained on the page, including deleted +records on the free list. The dense directory is indexed in the +collation order, i.e., in the order in which the record list is +linked on the uncompressed page. The infimum and supremum records are +excluded. The two most significant bits of the entries are allocated +for the delete-mark and an n_owned flag indicating the last record in +a chain of records pointed to from the sparse page directory on the +uncompressed page. + +The data between PAGE_ZIP_START and the last page directory entry will +be written in compressed format, starting at offset PAGE_DATA. +Infimum and supremum records are not stored. We exclude the +REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered +from the dense page directory stored at the end of the compressed +page. 
+ +The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and +roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of +externally stored columns are stored separately, in ascending order of +heap_no and column index, starting backwards from the dense page +directory. + +The compressed data stream may be followed by a modification log +covering the compressed portion of the page, as follows. + +MODIFICATION LOG ENTRY FORMAT +- write record: + - (heap_no - 1) << 1 (1..2 bytes) + - extra bytes backwards + - data bytes +- clear record: + - (heap_no - 1) << 1 | 1 (1..2 bytes) + +The integer values are stored in a variable-length format: +- 0xxxxxxx: 0..127 +- 1xxxxxxx xxxxxxxx: 0..32767 + +The end of the modification log is marked by a 0 byte. + +In summary, the compressed page looks like this: + +(1) Uncompressed page header (PAGE_DATA bytes) +(2) Compressed index information +(3) Compressed page data +(4) Page modification log (page_zip->m_start..page_zip->m_end) +(5) Empty zero-filled space +(6) BLOB pointers (on leaf pages) + - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column + - in descending collation order +(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes, + - indexed by heap_no + - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes + - REC_NODE_PTR_SIZE for non-leaf pages + - 0 otherwise +(8) dense page directory, stored backwards + - n_dense = n_heap - 2 + - existing records in ascending collation order + - deleted records (free list) in link order +*/ + +/**********************************************************************//** +Determine the size of a compressed page in bytes. 
+@return size in bytes */ +UNIV_INLINE +ulint +page_zip_get_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + ulint size; + + if (!page_zip->ssize) { + return(0); + } + + size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize; + + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + ut_ad(size <= srv_page_size); + + return(size); +} +/**********************************************************************//** +Set the size of a compressed page in bytes. */ +UNIV_INLINE +void +page_zip_set_size( +/*==============*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint size) /*!< in: size in bytes */ +{ + if (size) { + unsigned ssize; + + ut_ad(ut_is_2pow(size)); + + for (ssize = 1; size > (512U << ssize); ssize++) { + } + + page_zip->ssize = ssize & ((1U << PAGE_ZIP_SSIZE_BITS) - 1); + } else { + page_zip->ssize = 0; + } + + ut_ad(page_zip_get_size(page_zip) == size); +} + +/** Determine if a record is so big that it needs to be stored externally. +@param[in] rec_size length of the record in bytes +@param[in] comp nonzero=compact format +@param[in] n_fields number of fields in the record; ignored if +tablespace is not compressed +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return false if the entire record can be stored locally on the page */ +inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields, + ulint zip_size) +{ + /* FIXME: the row size check in this function seems to be the most correct. + Put it in a separate function and use it in more places in InnoDB */ + + ut_ad(rec_size + > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)); + ut_ad(comp || !zip_size); + +#if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE + if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE : + rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) { + return(TRUE); + } +#endif + + if (zip_size) { + ut_ad(comp); + /* On a compressed page, there is a two-byte entry in + the dense page directory for every record. 
But there + is no record header. There should be enough room for + one record on an empty leaf page. Subtract 1 byte for + the encoded heap number. Check also the available space + on the uncompressed page. */ + return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1) + >= page_zip_empty_size(n_fields, zip_size) + || rec_size >= page_get_free_space_of_empty(TRUE) / 2); + } + + return(rec_size >= page_get_free_space_of_empty(comp) / 2); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate a compressed page descriptor. +@return TRUE if ok */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */ +{ + ut_ad(page_zip); + ut_ad(page_zip->data); + ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE); + ut_ad(page_zip->m_start <= page_zip->m_end); + ut_ad(page_zip->m_end < page_zip_get_size(page_zip)); + ut_ad(page_zip->n_blobs + < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Determine the length of the page trailer. 
+@return length of the page trailer, in bytes, not including the +terminating zero byte of the modification log */ +UNIV_INLINE +ulint +page_zip_get_trailer_len( +/*=====================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust)/*!< in: TRUE if clustered index */ +{ + ulint uncompressed_size; + + ut_ad(page_zip_simple_validate(page_zip)); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + /* Trailer bytes needed per record: one dense directory slot, + plus the node pointer on non-leaf pages, or DB_TRX_ID and + DB_ROLL_PTR on leaf pages of a clustered index. */ + if (!page_is_leaf(page_zip->data)) { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + REC_NODE_PTR_SIZE; + ut_ad(!page_zip->n_blobs); + } else if (is_clust) { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE; + ut_ad(!page_zip->n_blobs); + } + + /* n_heap - 2 excludes the infimum and supremum records; each + externally stored column adds one BLOB pointer. */ + return (ulint(page_dir_get_n_heap(page_zip->data)) - 2) + * uncompressed_size + + ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE; +} + +/**********************************************************************//** +Determine how big a record can be inserted without recompressing the page. +@return a positive number indicating the maximum size of a record +whose insertion is guaranteed to succeed, or zero or negative */ +UNIV_INLINE +lint +page_zip_max_ins_size( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust)/*!< in: TRUE if clustered index */ +{ + ulint trailer_len; + + trailer_len = page_zip_get_trailer_len(page_zip, is_clust); + + /* When a record is created, a pointer may be added to + the dense directory. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. 
*/ + + trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; + + return(lint(page_zip_get_size(page_zip) + - trailer_len - page_zip->m_end + - (REC_N_NEW_EXTRA_BYTES - 2))); +} + +/**********************************************************************//** +Determine if enough space is available in the modification log. +@return TRUE if enough space is available */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust,/*!< in: TRUE if clustered index */ + ulint length, /*!< in: combined size of the record */ + ulint create) /*!< in: nonzero=add the record to + the heap */ +{ + ulint trailer_len; + + ut_ad(length > REC_N_NEW_EXTRA_BYTES); + + trailer_len = page_zip_get_trailer_len(page_zip, is_clust); + + /* Subtract the fixed extra bytes and add the maximum + space needed for identifying the record (encoded heap_no). */ + length -= REC_N_NEW_EXTRA_BYTES - 2; + + if (create > 0) { + /* When a record is created, a pointer may be added to + the dense directory. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. */ + + trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; + } + + return(length + trailer_len + page_zip->m_end + < page_zip_get_size(page_zip)); +} + +/**********************************************************************//** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip) /*!< in/out: compressed page + descriptor */ +{ + memset(page_zip, 0, sizeof *page_zip); +} + +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. 
*/ +UNIV_INLINE +void +page_zip_reset_stat_per_index() +/*===========================*/ +{ + mutex_enter(&page_zip_stat_per_index_mutex); + + page_zip_stat_per_index.erase( + page_zip_stat_per_index.begin(), + page_zip_stat_per_index.end()); + + mutex_exit(&page_zip_stat_per_index_mutex); +} diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h new file mode 100644 index 00000000..58d424ab --- /dev/null +++ b/storage/innobase/include/pars0grm.h @@ -0,0 +1,145 @@ +/* A Bison parser, made by GNU Bison 3.4.2. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2019 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. 
+ + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* Undocumented macros, especially those whose name start with YY_, + are private implementation details. Do not rely on them. */ + +#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED +# define YY_YY_PARS0GRM_TAB_H_INCLUDED +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int yydebug; +#endif + +/* Token type. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + PARS_INT_LIT = 258, + PARS_FLOAT_LIT = 259, + PARS_STR_LIT = 260, + PARS_NULL_LIT = 261, + PARS_ID_TOKEN = 262, + PARS_AND_TOKEN = 263, + PARS_OR_TOKEN = 264, + PARS_NOT_TOKEN = 265, + PARS_GE_TOKEN = 266, + PARS_LE_TOKEN = 267, + PARS_NE_TOKEN = 268, + PARS_PROCEDURE_TOKEN = 269, + PARS_IN_TOKEN = 270, + PARS_INT_TOKEN = 271, + PARS_CHAR_TOKEN = 272, + PARS_IS_TOKEN = 273, + PARS_BEGIN_TOKEN = 274, + PARS_END_TOKEN = 275, + PARS_IF_TOKEN = 276, + PARS_THEN_TOKEN = 277, + PARS_ELSE_TOKEN = 278, + PARS_ELSIF_TOKEN = 279, + PARS_LOOP_TOKEN = 280, + PARS_WHILE_TOKEN = 281, + PARS_RETURN_TOKEN = 282, + PARS_SELECT_TOKEN = 283, + PARS_COUNT_TOKEN = 284, + PARS_FROM_TOKEN = 285, + PARS_WHERE_TOKEN = 286, + PARS_FOR_TOKEN = 287, + PARS_DDOT_TOKEN = 288, + PARS_ORDER_TOKEN = 289, + PARS_BY_TOKEN = 290, + PARS_ASC_TOKEN = 291, + PARS_DESC_TOKEN = 292, + PARS_INSERT_TOKEN = 293, + PARS_INTO_TOKEN = 294, + PARS_VALUES_TOKEN = 295, + PARS_UPDATE_TOKEN = 296, + PARS_SET_TOKEN = 297, + PARS_DELETE_TOKEN = 298, + PARS_CURRENT_TOKEN = 299, + PARS_OF_TOKEN = 300, + PARS_CREATE_TOKEN = 301, + PARS_TABLE_TOKEN = 302, + PARS_INDEX_TOKEN = 303, + PARS_UNIQUE_TOKEN = 304, + PARS_CLUSTERED_TOKEN = 305, + PARS_ON_TOKEN = 306, + PARS_ASSIGN_TOKEN = 307, + PARS_DECLARE_TOKEN = 308, + PARS_CURSOR_TOKEN = 309, + PARS_SQL_TOKEN = 310, + PARS_OPEN_TOKEN = 311, + PARS_FETCH_TOKEN = 312, + PARS_CLOSE_TOKEN = 313, + PARS_NOTFOUND_TOKEN = 314, + PARS_TO_BINARY_TOKEN = 315, + PARS_SUBSTR_TOKEN = 
316, + PARS_CONCAT_TOKEN = 317, + PARS_INSTR_TOKEN = 318, + PARS_LENGTH_TOKEN = 319, + PARS_COMMIT_TOKEN = 320, + PARS_ROLLBACK_TOKEN = 321, + PARS_WORK_TOKEN = 322, + PARS_EXIT_TOKEN = 323, + PARS_FUNCTION_TOKEN = 324, + PARS_LOCK_TOKEN = 325, + PARS_SHARE_TOKEN = 326, + PARS_MODE_TOKEN = 327, + PARS_LIKE_TOKEN = 328, + PARS_LIKE_TOKEN_EXACT = 329, + PARS_LIKE_TOKEN_PREFIX = 330, + PARS_LIKE_TOKEN_SUFFIX = 331, + PARS_LIKE_TOKEN_SUBSTR = 332, + PARS_TABLE_NAME_TOKEN = 333, + PARS_BIGINT_TOKEN = 334, + NEG = 335 + }; +#endif + +/* Value type. */ +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef int YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define YYSTYPE_IS_DECLARED 1 +#endif + + +extern YYSTYPE yylval; + +int yyparse (void); + +#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED */ diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h new file mode 100644 index 00000000..07a726ea --- /dev/null +++ b/storage/innobase/include/pars0opt.h @@ -0,0 +1,68 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2018, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0opt.h +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0opt_h +#define pars0opt_h + +#include "que0types.h" +#include "pars0sym.h" +#include "row0sel.h" + +/*******************************************************************//** +Optimizes a select. Decides which indexes to tables to use. The tables +are accessed in the order that they were written to the FROM part in the +select statement. */ +void +opt_search_plan( +/*============*/ + sel_node_t* sel_node); /*!< in: parsed select node */ +/*******************************************************************//** +Looks for occurrences of the columns of the table in the query subgraph and +adds them to the list of columns if an occurrence of the same column does not +already exist in the list. If the column is already in the list, puts a value +indirection to point to the occurrence in the column list, except if the +column occurrence we are looking at is in the column list, in which case +nothing is done. */ +void +opt_find_all_cols( +/*==============*/ + ibool copy_val, /*!< in: if TRUE, new found columns are + added as columns to copy */ + dict_index_t* index, /*!< in: index to use */ + sym_node_list_t* col_list, /*!< in: base node of a list where + to add new found columns */ + plan_t* plan, /*!< in: plan or NULL */ + que_node_t* exp); /*!< in: expression or condition */ +#ifdef UNIV_SQL_DEBUG +/********************************************************************//** +Prints info of a query plan. 
*/ +void +opt_print_query_plan( +/*=================*/ + sel_node_t* sel_node); /*!< in: select node */ +#endif /* UNIV_SQL_DEBUG */ + +#endif diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h new file mode 100644 index 00000000..03aa72d3 --- /dev/null +++ b/storage/innobase/include/pars0pars.h @@ -0,0 +1,724 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0pars.h +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ + +#ifndef pars0pars_h +#define pars0pars_h + +#include "que0types.h" +#include "pars0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" +#include "row0mysql.h" + +/** Type of the user functions. The first argument is always InnoDB-supplied +and varies in type, while 'user_arg' is a user-supplied argument. The +meaning of the return type also varies. See the individual use cases, e.g. +the FETCH statement, for details on them. 
*/ +typedef ibool (*pars_user_func_cb_t)(void* arg, void* user_arg); + +/** If the following is set TRUE, the parser will emit debugging +information */ +extern int yydebug; + +/* Global variable used while parsing a single procedure or query : the code is +NOT re-entrant */ +extern sym_tab_t* pars_sym_tab_global; + +extern pars_res_word_t pars_to_binary_token; +extern pars_res_word_t pars_substr_token; +extern pars_res_word_t pars_concat_token; +extern pars_res_word_t pars_length_token; +extern pars_res_word_t pars_instr_token; +extern pars_res_word_t pars_count_token; +extern pars_res_word_t pars_int_token; +extern pars_res_word_t pars_bigint_token; +extern pars_res_word_t pars_char_token; +extern pars_res_word_t pars_update_token; +extern pars_res_word_t pars_asc_token; +extern pars_res_word_t pars_desc_token; +extern pars_res_word_t pars_open_token; +extern pars_res_word_t pars_close_token; +extern pars_res_word_t pars_share_token; +extern pars_res_word_t pars_unique_token; +extern pars_res_word_t pars_clustered_token; + +extern ulint pars_star_denoter; + +/* Procedure parameter types */ +#define PARS_INPUT 0 +#define PARS_OUTPUT 1 +#define PARS_NOT_PARAM 2 + +int +yyparse(void); + +/*************************************************************//** +Parses an SQL string returning the query graph. +@return own: the query graph */ +que_t* +pars_sql( +/*=====*/ + pars_info_t* info, /*!< in: extra information, or NULL */ + const char* str); /*!< in: SQL string */ +/*************************************************************//** +Retrieves characters to the lexical analyzer. +@return number of characters copied or 0 on EOF */ +int +pars_get_lex_chars( +/*===============*/ + char* buf, /*!< in/out: buffer where to copy */ + size_t max_size); /*!< in: maximum number of characters which fit + in the buffer */ +/*************************************************************//** +Called by yyparse on error. 
*/ +void +yyerror( +/*====*/ + const char* s); /*!< in: error message string */ +/*********************************************************************//** +Parses a variable declaration. +@return own: symbol table node of type SYM_VAR */ +sym_node_t* +pars_variable_declaration( +/*======================*/ + sym_node_t* node, /*!< in: symbol table node allocated for the + id of the variable */ + pars_res_word_t* type); /*!< in: pointer to a type token */ +/*********************************************************************//** +Parses a function expression. +@return own: function node in a query tree */ +func_node_t* +pars_func( +/*======*/ + que_node_t* res_word,/*!< in: function name reserved word */ + que_node_t* arg); /*!< in: first argument in the argument list */ +/************************************************************************* +Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded +within the search string. +@return an int code, not a node (presumably one of the PARS_LIKE_TOKEN_* +codes; verify against the definition) */ +int +pars_like_rebind( +/*=============*/ + sym_node_t* node, /* in: The search string node.*/ + const byte* ptr, /* in: literal to (re) bind */ + ulint len); /* in: length of literal to (re) bind*/ +/*********************************************************************//** +Parses an operator expression. +@return own: function node in a query tree */ +func_node_t* +pars_op( +/*====*/ + int func, /*!< in: operator token code */ + que_node_t* arg1, /*!< in: first argument */ + que_node_t* arg2); /*!< in: second argument or NULL for an unary + operator */ +/*********************************************************************//** +Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */ +order_node_t* +pars_order_by( +/*==========*/ + sym_node_t* column, /*!< in: column name */ + pars_res_word_t* asc); /*!< in: &pars_asc_token or &pars_desc_token */ +/*********************************************************************//** +Parses a select list; creates a query graph node for the whole SELECT +statement. +@return own: select node in a query tree */ +sel_node_t* +pars_select_list( +/*=============*/ + que_node_t* select_list, /*!< in: select list */ + sym_node_t* into_list); /*!< in: variables list or NULL */ +/*********************************************************************//** +Parses a cursor declaration. +@return sym_node */ +que_node_t* +pars_cursor_declaration( +/*====================*/ + sym_node_t* sym_node, /*!< in: cursor id node in the symbol + table */ + sel_node_t* select_node); /*!< in: select node */ +/*********************************************************************//** +Parses a function declaration. +@return sym_node */ +que_node_t* +pars_function_declaration( +/*======================*/ + sym_node_t* sym_node); /*!< in: function id node in the symbol + table */ +/*********************************************************************//** +Parses a select statement. +@return own: select node in a query tree */ +sel_node_t* +pars_select_statement( +/*==================*/ + sel_node_t* select_node, /*!< in: select node already containing + the select list */ + sym_node_t* table_list, /*!< in: table list */ + que_node_t* search_cond, /*!< in: search condition or NULL */ + pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */ + pars_res_word_t* consistent_read,/*!< in: NULL or + &pars_consistent_token */ + order_node_t* order_by); /*!< in: NULL or an order-by node */ +/*********************************************************************//** +Parses a column assignment in an update.
+@return column assignment node */ +col_assign_node_t* +pars_column_assignment( +/*===================*/ + sym_node_t* column, /*!< in: column to assign */ + que_node_t* exp); /*!< in: value to assign */ +/*********************************************************************//** +Parses a delete or update statement start. +@return own: update node in a query tree */ +upd_node_t* +pars_update_statement_start( +/*========================*/ + ibool is_delete, /*!< in: TRUE if delete */ + sym_node_t* table_sym, /*!< in: table name node */ + col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL + if delete */ +/*********************************************************************//** +Parses an update or delete statement. +@return own: update node in a query tree */ +upd_node_t* +pars_update_statement( +/*==================*/ + upd_node_t* node, /*!< in: update node */ + sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in + the symbol table or NULL */ + que_node_t* search_cond); /*!< in: search condition or NULL */ +/*********************************************************************//** +Parses an insert statement. +@return own: insert node in a query tree */ +ins_node_t* +pars_insert_statement( +/*==================*/ + sym_node_t* table_sym, /*!< in: table name node */ + que_node_t* values_list, /*!< in: value expression list or NULL */ + sel_node_t* select); /*!< in: select condition or NULL */ +/*********************************************************************//** +Parses an elsif element. +@return elsif node */ +elsif_node_t* +pars_elsif_element( +/*===============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an if-statement.
+@return if-statement node */ +if_node_t* +pars_if_statement( +/*==============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list, /*!< in: statement list */ + que_node_t* else_part); /*!< in: else-part statement list */ +/*********************************************************************//** +Parses a for-loop-statement. +@return for-statement node */ +for_node_t* +pars_for_statement( +/*===============*/ + sym_node_t* loop_var, /*!< in: loop variable */ + que_node_t* loop_start_limit,/*!< in: loop start expression */ + que_node_t* loop_end_limit, /*!< in: loop end expression */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses a while-statement. +@return while-statement node */ +while_node_t* +pars_while_statement( +/*=================*/ + que_node_t* cond, /*!< in: while-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an exit statement. +@return exit statement node */ +exit_node_t* +pars_exit_statement(void); +/*=====================*/ +/*********************************************************************//** +Parses a return-statement. +@return return-statement node */ +return_node_t* +pars_return_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a procedure call. +@return function node */ +func_node_t* +pars_procedure_call( +/*================*/ + que_node_t* res_word,/*!< in: procedure name reserved word */ + que_node_t* args); /*!< in: argument list */ +/*********************************************************************//** +Parses an assignment statement. 
+@return assignment statement node */ +assign_node_t* +pars_assignment_statement( +/*======================*/ + sym_node_t* var, /*!< in: variable to assign */ + que_node_t* val); /*!< in: value to assign */ +/*********************************************************************//** +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. +@return fetch statement node */ +fetch_node_t* +pars_fetch_statement( +/*=================*/ + sym_node_t* cursor, /*!< in: cursor node */ + sym_node_t* into_list, /*!< in: variables to set, or NULL */ + sym_node_t* user_func); /*!< in: user function name, or NULL */ +/*********************************************************************//** +Parses an open or close cursor statement. +@return open/close statement node */ +open_node_t* +pars_open_statement( +/*================*/ + ulint type, /*!< in: ROW_SEL_OPEN_CURSOR + or ROW_SEL_CLOSE_CURSOR */ + sym_node_t* cursor); /*!< in: cursor node */ +/*********************************************************************//** +Parses a row_printf-statement. +@return row_printf-statement node */ +row_printf_node_t* +pars_row_printf_statement( +/*======================*/ + sel_node_t* sel_node); /*!< in: select node */ +/*********************************************************************//** +Parses a commit statement. +@return own: commit node struct */ +commit_node_t* +pars_commit_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a rollback statement. +@return own: rollback node struct */ +roll_node_t* +pars_rollback_statement(void); +/*=========================*/ +/*********************************************************************//** +Parses a column definition at a table creation.
+@return column sym table node */ +sym_node_t* +pars_column_def( +/*============*/ + sym_node_t* sym_node, /*!< in: column node in the + symbol table */ + pars_res_word_t* type, /*!< in: data type */ + sym_node_t* len, /*!< in: length of column, or + NULL */ + void* is_not_null); /*!< in: if not NULL, column + is of type NOT NULL. */ +/*********************************************************************//** +Parses a table creation operation. +@return table create subgraph */ +tab_node_t* +pars_create_table( +/*==============*/ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_defs); /*!< in: list of column names */ +/*********************************************************************//** +Parses an index creation operation. +@return index create subgraph */ +ind_node_t* +pars_create_index( +/*==============*/ + pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */ + sym_node_t* index_sym, /*!< in: index name node in the symbol + table */ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_list); /*!< in: list of column names */ +/*********************************************************************//** +Parses a procedure definition. +@return query fork node */ +que_fork_t* +pars_procedure_definition( +/*======================*/ + sym_node_t* sym_node, /*!< in: procedure id node in the symbol + table */ + que_node_t* stat_list); /*!< in: statement list */ + +/*************************************************************//** +Parses a stored procedure call, when this is not within another stored +procedure, that is, the client issues a procedure call directly. +In MySQL/InnoDB, stored InnoDB procedures are invoked via the +parsed procedure tree, not via InnoDB SQL, so this function is not used. 
+@return query graph */ +que_fork_t* +pars_stored_procedure_call( +/*=======================*/ + sym_node_t* sym_node); /*!< in: stored procedure name */ +/** Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. The fork created is of +type QUE_FORK_MYSQL_INTERFACE. +@param[in] node root node for an incomplete query + graph, or NULL for dummy graph +@param[in] trx transaction handle +@param[in] heap memory heap from which allocated +@param[in] prebuilt row prebuilt structure +@return query thread node to run */ +que_thr_t* +pars_complete_graph_for_exec( + que_node_t* node, + trx_t* trx, + mem_heap_t* heap, + row_prebuilt_t* prebuilt) + MY_ATTRIBUTE((nonnull(2,3), warn_unused_result)); + +/****************************************************************//** +Create parser info struct. +@return own: info struct */ +pars_info_t* +pars_info_create(void); +/*==================*/ + +/****************************************************************//** +Free info struct and everything it contains. */ +void +pars_info_free( +/*===========*/ + pars_info_t* info); /*!< in, own: info struct */ + +/****************************************************************//** +Add bound literal. */ +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const void* address, /*!< in: address */ + ulint length, /*!< in: length of data */ + ulint type, /*!< in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /*!< in: precise type, e.g. + DATA_UNSIGNED */ + +/****************************************************************//** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). 
*/ +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* str); /*!< in: string */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +void +pars_info_bind_literal( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /* in: precise type, e.g. DATA_UNSIGNED */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +void +pars_info_bind_varchar_literal( +/*===========================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const byte* str, /*!< in: string */ + ulint str_len); /*!< in: string length */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +void +pars_info_bind_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint32_t* val); /*!< in: value */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +void +pars_info_bind_int8_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val); /*!< in: value */ +/****************************************************************//** +Add user function.
*/ +void +pars_info_bind_function( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: function name */ + pars_user_func_cb_t func, /*!< in: function address */ + void* arg); /*!< in: user-supplied argument */ +/****************************************************************//** +Add bound id. */ +void +pars_info_bind_id( +/*=============*/ + pars_info_t* info, /*!< in: info struct */ + ibool copy_name,/* in: make a copy of name if TRUE */ + const char* name, /*!< in: name */ + const char* id); /*!< in: id */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ulint val); /*!< in: value */ + +/****************************************************************//** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +void +pars_info_add_ull_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ib_uint64_t val); /*!< in: value */ + +/****************************************************************//** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +void +pars_info_bind_ull_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val) /*!< in: value */ + MY_ATTRIBUTE((nonnull)); + +/****************************************************************//** +Add bound id. 
*/ +void +pars_info_add_id( +/*=============*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* id); /*!< in: id */ + +/****************************************************************//** +Get bound literal with the given name. +@return bound literal, or NULL if not found */ +pars_bound_lit_t* +pars_info_get_bound_lit( +/*====================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound literal name to find */ + +/****************************************************************//** +Get bound id with the given name. +@return bound id, or NULL if not found */ +pars_bound_id_t* +pars_info_get_bound_id( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound id name to find */ + +/******************************************************************//** +Release any resources used by the lexer. */ +void +pars_lexer_close(void); +/*==================*/ + +/** Extra information supplied for pars_sql(). */ +struct pars_info_t { + mem_heap_t* heap; /*!< our own memory heap */ + + ib_vector_t* funcs; /*!< user functions, or NULL + (pars_user_func_t*) */ + ib_vector_t* bound_lits; /*!< bound literals, or NULL + (pars_bound_lit_t*) */ + ib_vector_t* bound_ids; /*!< bound ids, or NULL + (pars_bound_id_t*) */ + + ibool graph_owns_us; /*!< if TRUE (which is the default), + que_graph_free() will free us */ +}; + +/** User-supplied function and argument. */ +struct pars_user_func_t { + const char* name; /*!< function name */ + pars_user_func_cb_t func; /*!< function address */ + void* arg; /*!< user-supplied argument */ +}; + +/** Bound literal. */ +struct pars_bound_lit_t { + const char* name; /*!< name */ + const void* address; /*!< address */ + ulint length; /*!< length of data */ + ulint type; /*!< type, e.g. DATA_FIXBINARY */ + ulint prtype; /*!< precise type, e.g.
DATA_UNSIGNED */ + sym_node_t* node; /*!< symbol node */ +}; + +/** Bound identifier. */ +struct pars_bound_id_t { + const char* name; /*!< name */ + const char* id; /*!< identifier */ +}; + +/** Struct used to denote a reserved word in a parsing tree */ +struct pars_res_word_t{ + int code; /*!< the token code for the reserved word from + pars0grm.h */ +}; + +/** A predefined function or operator node in a parsing tree; this construct +is also used for some non-functions like the assignment ':=' */ +struct func_node_t{ + que_common_t common; /*!< type: QUE_NODE_FUNC */ + int func; /*!< token code of the function name */ + ulint fclass; /*!< class of the function */ + que_node_t* args; /*!< argument(s) of the function */ + UT_LIST_NODE_T(func_node_t) cond_list; + /*!< list of comparison conditions; defined + only for comparison operator nodes except, + presently, for OPT_SCROLL_TYPE ones */ + UT_LIST_NODE_T(func_node_t) func_node_list; + /*!< list of function nodes in a parsed + query graph */ +}; + +/** An order-by node in a select */ +struct order_node_t{ + que_common_t common; /*!< type: QUE_NODE_ORDER */ + sym_node_t* column; /*!< order-by column */ + ibool asc; /*!< TRUE if ascending, FALSE if descending */ +}; + +/** Procedure definition node */ +struct proc_node_t{ + que_common_t common; /*!< type: QUE_NODE_PROC */ + sym_node_t* proc_id; /*!< procedure name symbol in the symbol + table of this same procedure */ + que_node_t* stat_list; /*!< statement list */ + sym_tab_t* sym_tab; /*!< symbol table of this procedure */ +}; + +/** elsif-element node */ +struct elsif_node_t{ + que_common_t common; /*!< type: QUE_NODE_ELSIF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** if-statement node */ +struct if_node_t{ + que_common_t common; /*!< type: QUE_NODE_IF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ + que_node_t* else_part; /*!< else-part statement list */ + 
elsif_node_t* elsif_list; /*!< elsif element list */ +}; + +/** while-statement node */ +struct while_node_t{ + que_common_t common; /*!< type: QUE_NODE_WHILE */ + que_node_t* cond; /*!< while condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** for-loop-statement node */ +struct for_node_t{ + que_common_t common; /*!< type: QUE_NODE_FOR */ + sym_node_t* loop_var; /*!< loop variable: this is the + dereferenced symbol from the + variable declarations, not the + symbol occurrence in the for loop + definition */ + que_node_t* loop_start_limit;/*!< initial value of loop variable */ + que_node_t* loop_end_limit; /*!< end value of loop variable */ + lint loop_end_value; /*!< evaluated value for the end value: + it is calculated only when the loop + is entered, and will not change within + the loop */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** exit statement node */ +struct exit_node_t{ + que_common_t common; /*!< type: QUE_NODE_EXIT */ +}; + +/** return-statement node */ +struct return_node_t{ + que_common_t common; /*!< type: QUE_NODE_RETURN */ +}; + +/** Assignment statement node */ +struct assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */ + sym_node_t* var; /*!< variable to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Column assignment node */ +struct col_assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */ + sym_node_t* col; /*!< column to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Classes of functions */ +/* @{ */ +#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */ +#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */ +#define PARS_FUNC_CMP 3 /*!< comparison operators */ +#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... 
*/ +#define PARS_FUNC_AGGREGATE 5 /*!< COUNT */ +#define PARS_FUNC_OTHER 6 /*!< these are not real functions, + e.g., := */ +/* @} */ + +#endif diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h new file mode 100644 index 00000000..59f6cc31 --- /dev/null +++ b/storage/innobase/include/pars0sym.h @@ -0,0 +1,243 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0sym.h +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0sym_h +#define pars0sym_h + +#include "que0types.h" +#include "pars0types.h" +#include "row0types.h" + +/******************************************************************//** +Creates a symbol table for a single stored procedure or query. 
+@return own: symbol table */ +sym_tab_t* +sym_tab_create( +/*===========*/ + mem_heap_t* heap); /*!< in: memory heap where to create */ +/******************************************************************//** +Frees the memory allocated dynamically AFTER the parsing phase for variables +etc. in the symbol table. Does not free the mem heap where the table was +originally created. Also frees SQL explicit cursor definitions. */ +void +sym_tab_free_private( +/*=================*/ + sym_tab_t* sym_tab); /*!< in, own: symbol table */ +/******************************************************************//** +Adds an integer literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_int_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + ulint val); /*!< in: integer value */ +/******************************************************************//** +Adds a string literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_str_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const byte* str, /*!< in: string with no quotes around + it */ + ulint len); /*!< in: string length */ +/******************************************************************//** +Adds a bound literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_bound_lit( +/*==================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name, /*!< in: name of bound literal */ + ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */ +/********************************************************************** +Rebind literal to a node in the symbol table.
*/ +sym_node_t* +sym_tab_rebind_lit( +/*===============*/ + /* out: symbol table node */ + sym_node_t* node, /* in: node that is bound to literal */ + const void* address, /* in: pointer to data */ + ulint length); /* in: length of data */ +/******************************************************************//** +Adds an SQL null literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_null_lit( +/*=================*/ + sym_tab_t* sym_tab); /*!< in: symbol table */ +/******************************************************************//** +Adds an identifier to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + byte* name, /*!< in: identifier name */ + ulint len); /*!< in: identifier length */ + +/******************************************************************//** +Adds a bound identifier to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_bound_id( +/*=================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name); /*!< in: name of bound id */ + +/** Index of sym_node_t::field_nos corresponding to the clustered index */ +#define SYM_CLUST_FIELD_NO 0 +/** Index of sym_node_t::field_nos corresponding to a secondary index */ +#define SYM_SEC_FIELD_NO 1 + +/** Types of a symbol table node */ +enum sym_tab_entry { + SYM_UNSET, /*!< Unset entry. */ + SYM_VAR = 91, /*!< declared parameter or local + variable of a procedure */ + SYM_IMPLICIT_VAR, /*!< storage for an intermediate result + of a calculation */ + SYM_LIT, /*!< literal */ + SYM_TABLE_REF_COUNTED, /*!< database table name, ref counted. Must + be closed explicitly.
*/ + SYM_TABLE, /*!< database table name */ + SYM_COLUMN, /*!< database table column name */ + SYM_CURSOR, /*!< named cursor */ + SYM_PROCEDURE_NAME, /*!< stored procedure name */ + SYM_INDEX, /*!< database index name */ + SYM_FUNCTION /*!< user function name */ +}; + +/** Symbol table node */ +struct sym_node_t{ + que_common_t common; /*!< node type: + QUE_NODE_SYMBOL */ + /* NOTE: if the data field in 'common.val' is not NULL and the symbol + table node is not for a temporary column, the memory for the value has + been allocated from dynamic memory and it should be freed when the + symbol table is discarded */ + + /* 'alias' and 'indirection' are almost the same, but not quite. + 'alias' always points to the primary instance of the variable, while + 'indirection' does the same only if we should use the primary + instance's values for the node's data. This is usually the case, but + when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM + t WHERE id = x;"), we copy the values from the primary instance to + the cursor's instance so that they are fixed for the duration of the + cursor, and set 'indirection' to NULL. If we did not, the value of + 'x' could change between fetches and things would break horribly. + + TODO: It would be cleaner to make 'indirection' a boolean field and + always use 'alias' to refer to the primary node.
*/ + + sym_node_t* indirection; /*!< pointer to + another symbol table + node which contains + the value for this + node, NULL otherwise */ + sym_node_t* alias; /*!< pointer to + another symbol table + node for which this + node is an alias, + NULL otherwise */ + UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table + columns or a list of + input variables for an + explicit cursor */ + ibool copy_val; /*!< TRUE if a column + and its value should + be copied to dynamic + memory when fetched */ + ulint field_nos[2]; /*!< if a column, in + the position + SYM_CLUST_FIELD_NO is + the field number in the + clustered index; in + the position + SYM_SEC_FIELD_NO + the field number in the + non-clustered index to + use first; if not found + from the index, then + ULINT_UNDEFINED */ + ibool resolved; /*!< TRUE if the + meaning of a variable + or a column has been + resolved; for literals + this is always TRUE */ + enum sym_tab_entry token_type; /*!< type of the + parsed token */ + const char* name; /*!< name of an id */ + ulint name_len; /*!< id name length */ + dict_table_t* table; /*!< table definition + if a table id or a + column id */ + ulint col_no; /*!< column number if a + column */ + sel_buf_t* prefetch_buf; /*!< NULL, or a buffer + for cached column + values for prefetched + rows */ + sel_node_t* cursor_def; /*!< cursor definition + select node if a + named cursor */ + ulint param_type; /*!< PARS_INPUT, + PARS_OUTPUT, or + PARS_NOT_PARAM if not a + procedure parameter */ + sym_tab_t* sym_table; /*!< back pointer to + the symbol table */ + UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol + nodes */ + sym_node_t* like_node; /* LIKE operator node*/ +}; + +/** Symbol table */ +struct sym_tab_t{ + que_t* query_graph; + /*!< query graph generated by the + parser */ + const char* sql_string; + /*!< SQL string to parse */ + size_t string_len; + /*!< SQL string length */ + size_t next_char_pos; + /*!< position of the next character in + sql_string to give to 
the lexical + analyzer */ + pars_info_t* info; /*!< extra information, or NULL */ + sym_node_list_t sym_list; + /*!< list of symbol nodes in the symbol + table */ + UT_LIST_BASE_NODE_T(func_node_t) + func_node_list; + /*!< list of function nodes in the + parsed query graph */ + mem_heap_t* heap; /*!< memory heap from which we can + allocate space */ +}; + +#endif diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h new file mode 100644 index 00000000..f5b69522 --- /dev/null +++ b/storage/innobase/include/pars0types.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0types.h +SQL parser global types + +Created 1/11/1998 Heikki Tuuri +*******************************************************/ + +#ifndef pars0types_h +#define pars0types_h + +struct pars_info_t; +struct pars_user_func_t; +struct pars_bound_lit_t; +struct pars_bound_id_t; +struct sym_node_t; +struct sym_tab_t; +struct pars_res_word_t; +struct func_node_t; +struct order_node_t; +struct proc_node_t; +struct elsif_node_t; +struct if_node_t; +struct while_node_t; +struct for_node_t; +struct exit_node_t; +struct return_node_t; +struct assign_node_t; +struct col_assign_node_t; + +typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t; + +#endif diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h new file mode 100644 index 00000000..e77857f4 --- /dev/null +++ b/storage/innobase/include/que0que.h @@ -0,0 +1,435 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.h +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0que_h +#define que0que_h + +#include "data0data.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "srv0srv.h" +#include "que0types.h" +#include "row0types.h" +#include "pars0types.h" + +/***********************************************************************//** +Creates a query graph fork node. +@return own: fork node */ +que_fork_t* +que_fork_create( +/*============*/ + que_t* graph, /*!< in: graph, if NULL then this + fork node is assumed to be the + graph root */ + que_node_t* parent, /*!< in: parent node */ + ulint fork_type, /*!< in: fork type */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent);/*!< in: parent */ +/** Creates a query graph thread node. 
+@param[in] parent parent node, i.e., a fork node +@param[in] heap memory heap where created +@param[in] prebuilt row prebuilt structure +@return own: query thread node */ +que_thr_t* +que_thr_create( + que_fork_t* parent, + mem_heap_t* heap, + row_prebuilt_t* prebuilt); +/**********************************************************************//** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations, they are freed in que_graph_free. */ +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node); /*!< in: query graph node */ +/**********************************************************************//** +Frees a query graph. */ +void +que_graph_free( +/*===========*/ + que_t* graph); /*!< in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ +/**********************************************************************//** +Stops a query thread if graph or trx is in a state requiring it. The +conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex +has to be reserved. +@return TRUE if stopped */ +ibool +que_thr_stop( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ + +/**********************************************************************//** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The +query thread is stopped and made inactive, except in the case where +it was put to the lock wait state in lock0lock.cc, but the lock has already +been granted or the transaction chosen as a victim in deadlock resolution. */ +void +que_thr_stop_for_mysql( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Run a query thread. Handles lock waits. 
*/ +void +que_run_threads( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Moves a suspended query thread to the QUE_THR_RUNNING state and release +a worker thread to execute it. This function should be used to end +the wait state of a query thread waiting for a lock or a stored procedure +completion. +@return query thread instance of thread to wakeup or NULL */ +que_thr_t* +que_thr_end_lock_wait( +/*==================*/ + trx_t* trx); /*!< in: transaction in the + QUE_THR_LOCK_WAIT state */ +/**********************************************************************//** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +que_thr_t* +que_fork_start_command( +/*===================*/ + que_fork_t* fork); /*!< in: a query fork */ +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + const que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. 
*/ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size); /*!< in: size */ +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node); /*!< in: node in a list */ +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node); /*!< in: node */ +/****************************************************************//** +Get the first containing loop node (e.g. while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. +@return containing loop node, or NULL. */ +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + que_node_t* node); /*!< in: node */ +/*********************************************************************//** +Catenates a query graph node to a list of them, possible empty list. 
+@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node); /*!< in: node */ +/************************************************************************* +Gets the last node of the list. */ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node of the list */ + que_node_t* node_list); /* in: node list; must not be NULL (the implementation asserts this with ut_a()) */ +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list); /*!< in: node list, or NULL */ +/**********************************************************************//** +Checks if the graph or its trx is in a state where the query thread should +be stopped. +@return TRUE if should be stopped; NOTE that if the peek is made +without reserving the trx_t::mutex, then another peek with the mutex +reserved is necessary before deciding the actual stopping */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Returns TRUE if the query graph is for a SELECT statement. +@return TRUE if a select */ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + que_t* graph); /*!< in: graph */ +/**********************************************************************//** +Prints info of an SQL query graph node.
*/ +void +que_node_print_info( +/*================*/ + que_node_t* node); /*!< in: query graph node */ +/*********************************************************************//** +Evaluate the given SQL +@return error code or DB_SUCCESS */ +dberr_t +que_eval_sql( +/*=========*/ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql, /*!< in: SQL string */ + bool reserve_dict_mutex, + /*!< in: whether to acquire/release + dict_sys.mutex around call to pars_sql. */ + trx_t* trx); /*!< in: trx */ + +/**********************************************************************//** +Round robin scheduler. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +que_thr_t* +que_fork_scheduler_round_robin( +/*===========================*/ + que_fork_t* fork, /*!< in: a query fork */ + que_thr_t* thr); /*!< in: current pos */ + +/** Query thread states */ +enum que_thr_state_t { + QUE_THR_RUNNING, + /** in selects this means that the thread is at the end of its + result set (or start, in case of a scroll cursor); in other + statements, this means the thread has done its task */ + QUE_THR_COMPLETED, + QUE_THR_COMMAND_WAIT, + QUE_THR_LOCK_WAIT, + QUE_THR_SUSPENDED +}; + +/** Query thread lock states */ +enum que_thr_lock_t { + QUE_THR_LOCK_NOLOCK, + QUE_THR_LOCK_ROW, + QUE_THR_LOCK_TABLE +}; + +/* Query graph query thread node: the fields are protected by the +trx_t::mutex with the exceptions named below */ + +struct que_thr_t{ + que_common_t common; /*!< type: QUE_NODE_THR */ + que_node_t* child; /*!< graph child node */ + que_t* graph; /*!< graph where this node belongs */ + que_thr_state_t state; /*!< state of the query thread */ + bool is_active; /*!< whether the thread is active */ + /*------------------------------*/ + /* The following fields are private to the OS thread executing the + query thread, and are not protected by any mutex: */ + + que_node_t* run_node; 
/*!< pointer to the node where the + subgraph down from this node is + currently executed */ + que_node_t* prev_node; /*!< pointer to the node from which + the control came */ + ulint resource; /*!< resource usage of the query thread + thus far */ + ulint lock_state; /*!< lock state of thread (table or + row) */ + struct srv_slot_t* + slot; /* The thread slot in the wait + array in srv_sys_t */ + /*------------------------------*/ + /* The following fields are links for the various lists that + this type can be on. */ + UT_LIST_NODE_T(que_thr_t) + thrs; /*!< list of thread nodes of the fork + node */ + UT_LIST_NODE_T(que_thr_t) + queue; /*!< list of runnable thread nodes in + the server task queue */ + ulint fk_cascade_depth; /*!< maximum cascading call depth + supported for foreign key constraint + related delete/updates */ + row_prebuilt_t* prebuilt; /*!< prebuilt structure processed by + the query thread */ + +#ifdef UNIV_DEBUG + /** Change the 'active' status */ + inline void set_active(bool active); +#endif + /** Transition to the QUE_THR_RUNNING state. */ + inline void start_running() + { + ut_d(if (!is_active) set_active(true)); + is_active= true; + state= QUE_THR_RUNNING; + } + + /** Stop query execution when there is no error or lock wait. 
*/ + void stop_no_error() + { + ut_ad(is_active); + ut_d(set_active(false)); + state= QUE_THR_COMPLETED; + is_active= false; + } +}; + +/* Query graph fork node: its fields are protected by the query thread mutex */ +struct que_fork_t{ + que_common_t common; /*!< type: QUE_NODE_FORK */ + que_t* graph; /*!< query graph of this node */ + ulint fork_type; /*!< fork type */ +#ifdef UNIV_DEBUG + /** For the query graph root, updated in set_active() */ + ulint n_active_thrs; + /** Change the 'active' status */ + void set_active(bool active); +#endif + trx_t* trx; /*!< transaction: this is set only in + the root node */ + ulint state; /*!< state of the fork node */ + que_thr_t* caller; /*!< pointer to a possible calling query + thread */ + UT_LIST_BASE_NODE_T(que_thr_t) + thrs; /*!< list of query threads */ + /*------------------------------*/ + /* The fields in this section are defined only in the root node */ + sym_tab_t* sym_tab; /*!< symbol table of the query, + generated by the parser, or NULL + if the graph was created 'by hand' */ + pars_info_t* info; /*!< info struct, or NULL */ + + sel_node_t* last_sel_node; /*!< last executed select node, or NULL + if none */ + UT_LIST_NODE_T(que_fork_t) + graphs; /*!< list of query graphs of a session + or a stored procedure */ + /*------------------------------*/ + mem_heap_t* heap; /*!< memory heap where the fork was + created */ + +}; + +#ifdef UNIV_DEBUG +inline void que_thr_t::set_active(bool active) { graph->set_active(active); }; +#endif + +/* Query fork (or graph) types */ +#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */ +#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */ +#define QUE_FORK_INSERT 3 +#define QUE_FORK_UPDATE 4 +#define QUE_FORK_ROLLBACK 5 + /* This is really the undo graph used in rollback, + no signal-sending roll_node in this graph */ +#define QUE_FORK_PURGE 6 +#define QUE_FORK_EXECUTE 7 +#define QUE_FORK_PROCEDURE 8 +#define QUE_FORK_PROCEDURE_CALL 9 +#define 
QUE_FORK_MYSQL_INTERFACE 10 +#define QUE_FORK_RECOVERY 11 + +/* Query fork (or graph) states */ +#define QUE_FORK_ACTIVE 1 +#define QUE_FORK_COMMAND_WAIT 2 +#define QUE_FORK_INVALID 3 +#define QUE_FORK_BEING_FREED 4 + +/* Flag which is ORed to control structure statement node types */ +#define QUE_NODE_CONTROL_STAT 1024 + +#include "que0que.ic" + +#endif diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic new file mode 100644 index 00000000..1c3ac242 --- /dev/null +++ b/storage/innobase/include/que0que.ic @@ -0,0 +1,293 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.ic +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +/***********************************************************************//** +Gets the trx of a query thread. 
*/
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread; checked with ut_ad() */
+{
+	ut_ad(thr);
+
+	/* The transaction is reached through the graph that the
+	thread points to. */
+	que_t*	owner = thr->graph;
+
+	return(owner->trx);
+}
+
+/** Gets the first thr in a fork.
+@param[in]	fork	query fork
+@return what UT_LIST_GET_FIRST() yields for the fork->thrs list */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+	que_fork_t*	fork)
+{
+	que_thr_t*	first = UT_LIST_GET_FIRST(fork->thrs);
+
+	return(first);
+}
+
+/** Gets the child node of the first thr in a fork.
+The first thr is dereferenced without a NULL check.
+@param[in]	fork	query fork
+@return the child member of the first thr in fork->thrs */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+	que_fork_t*	fork)
+{
+	return(UT_LIST_GET_FIRST(fork->thrs)->child);
+}
+
+/** Gets the type of a graph node.
+@param[in]	node	graph node, read as a que_common_t
+@return the type member of the node's que_common_t */
+UNIV_INLINE
+ulint
+que_node_get_type(
+	const que_node_t*	node)
+{
+	const que_common_t*	common
+		= reinterpret_cast<const que_common_t*>(node);
+
+	return(common->type);
+}
+
+/** Gets pointer to the value dfield of a graph node.
+@param[in]	node	graph node; checked with ut_ad()
+@return address of the val member of the node's que_common_t */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+	que_node_t*	node)
+{
+	ut_ad(node);
+
+	que_common_t*	common = (que_common_t*) node;
+
+	return(&common->val);
+}
+
+/** Gets the value buffer size of a graph node.
+@param[in]	node	graph node; checked with ut_ad()
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+	que_node_t*	node)
+{
+	ut_ad(node);
+
+	que_common_t*	common = (que_common_t*) node;
+
+	return(common->val_buf_size);
+}
+
+/***********************************************************************//**
+Sets the value buffer size of a graph node.
*/
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+	que_node_t*	node,	/*!< in: graph node; checked with ut_ad() */
+	ulint		size)	/*!< in: size */
+{
+	ut_ad(node);
+
+	que_common_t*	common = (que_common_t*) node;
+
+	common->val_buf_size = size;
+}
+
+/** Sets the parent of a graph node.
+@param[in]	node	graph node; checked with ut_ad()
+@param[in]	parent	parent node to store in the node's que_common_t */
+UNIV_INLINE
+void
+que_node_set_parent(
+	que_node_t*	node,
+	que_node_t*	parent)
+{
+	ut_ad(node);
+
+	que_common_t*	common = (que_common_t*) node;
+
+	common->parent = parent;
+}
+
+/** Gets pointer to the value data type field of a graph node.
+@param[in]	node	graph node; checked with ut_ad()
+@return result of dfield_get_type() on the node's val member */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+	que_node_t*	node)
+{
+	ut_ad(node);
+
+	que_common_t*	common = (que_common_t*) node;
+
+	return(dfield_get_type(&common->val));
+}
+
+/*********************************************************************//**
+Catenates a query graph node to a (possibly empty) list of them.
+@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node) /*!< in: node */ +{ + que_common_t* cnode; + que_common_t* cnode2; + + cnode = (que_common_t*) node; + + cnode->brother = NULL; + + if (node_list == NULL) { + + return(node); + } + + cnode2 = (que_common_t*) node_list; + + while (cnode2->brother != NULL) { + cnode2 = (que_common_t*) cnode2->brother; + } + + cnode2->brother = node; + + return(node_list); +} + +/************************************************************************* +Removes a query graph node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node in list.*/ + que_node_t* node_list) /* in: node list */ +{ + que_common_t* node; + + ut_a(node_list != NULL); + + node = (que_common_t*) node_list; + + /* We need the last element */ + while (node->brother != NULL) { + node = (que_common_t*) node->brother; + } + + return(node); +} +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. +@return next node in a list of nodes */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node) /*!< in: node in a list */ +{ + return(((que_common_t*) node)->brother); +} + +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list) /*!< in: node list, or NULL */ +{ + const que_common_t* cnode; + ulint len; + + cnode = (const que_common_t*) node_list; + len = 0; + + while (cnode != NULL) { + len++; + cnode = (const que_common_t*) cnode->brother; + } + + return(len); +} + +/*********************************************************************//** +Gets the parent node of a query graph node. 
+@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node) /*!< in: node */ +{ + return(((que_common_t*) node)->parent); +} + +/**********************************************************************//** +Checks if graph, trx, or session is in a state where the query thread should +be stopped. +@return TRUE if should be stopped; NOTE that if the peek is made +without reserving the trx mutex, then another peek with the mutex +reserved is necessary before deciding the actual stopping */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + que_t* graph; + + graph = thr->graph; + trx = graph->trx; + + if (graph->state != QUE_FORK_ACTIVE + || trx->lock.que_state == TRX_QUE_LOCK_WAIT + || (trx->lock.que_state != TRX_QUE_ROLLING_BACK + && trx->lock.que_state != TRX_QUE_RUNNING)) { + + return(TRUE); + } + + return(FALSE); +} + +/***********************************************************************//** +Returns TRUE if the query graph is for a SELECT statement. +@return TRUE if a select */ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + que_t* graph) /*!< in: graph */ +{ + if (graph->fork_type == QUE_FORK_SELECT_SCROLL + || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) { + + return(TRUE); + } + + return(FALSE); +} + diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h new file mode 100644 index 00000000..38f6e380 --- /dev/null +++ b/storage/innobase/include/que0types.h @@ -0,0 +1,97 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0types.h +Query graph global types + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0types_h +#define que0types_h + +#include "data0data.h" + +/* Pseudotype for all graph nodes */ +typedef void que_node_t; + +/* Query graph root is a fork node */ +typedef struct que_fork_t que_t; + +struct row_prebuilt_t; +struct que_thr_t; + +/* Query graph node types */ +#define QUE_NODE_LOCK 1 +#define QUE_NODE_INSERT 2 +#define QUE_NODE_UPDATE 4 +#define QUE_NODE_CURSOR 5 +#define QUE_NODE_SELECT 6 +#define QUE_NODE_AGGREGATE 7 +#define QUE_NODE_FORK 8 +#define QUE_NODE_THR 9 +#define QUE_NODE_UNDO 10 +#define QUE_NODE_COMMIT 11 +#define QUE_NODE_ROLLBACK 12 +#define QUE_NODE_PURGE 13 +#define QUE_NODE_CREATE_TABLE 14 +#define QUE_NODE_CREATE_INDEX 15 +#define QUE_NODE_SYMBOL 16 +#define QUE_NODE_RES_WORD 17 +#define QUE_NODE_FUNC 18 +#define QUE_NODE_ORDER 19 +#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_ASSIGNMENT 23 +#define QUE_NODE_FETCH 24 +#define QUE_NODE_OPEN 25 +#define QUE_NODE_COL_ASSIGNMENT 26 +#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_RETURN 28 +#define QUE_NODE_ROW_PRINTF 29 +#define QUE_NODE_ELSIF 30 +#define QUE_NODE_CALL 31 +#define QUE_NODE_EXIT 32 + +/* 
Common struct at the beginning of each query graph node; the name of this +substruct must be 'common' */ + +struct que_common_t{ + ulint type; /*!< query node type */ + que_node_t* parent; /*!< back pointer to parent node, or NULL */ + que_node_t* brother;/*!< next node in a one-way node list built by que_node_list_add_last(), or NULL for the last node */ + dfield_t val; /*!< evaluated value for an expression */ + ulint val_buf_size; + /*!< buffer size for the evaluated value data, + if the buffer has been allocated dynamically: + if this field is != 0, and the node is a + symbol node or a function node, then we + have to free the data field in val + explicitly */ + + /** Constructor: stores type and parent, and starts the node with + no brother (NULL), a value-initialized val and val_buf_size 0. + @param type query node type + @param parent back pointer to parent node, or NULL */ + que_common_t(ulint type, que_node_t* parent) : + type(type), parent(parent), brother(NULL), + val(), val_buf_size(0) + {} +}; + +#endif diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h new file mode 100644 index 00000000..21143ab6 --- /dev/null +++ b/storage/innobase/include/read0types.h @@ -0,0 +1,293 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0types.h +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#ifndef read0types_h +#define read0types_h + +#include "dict0mem.h" +#include "trx0types.h" +#include <algorithm> + + +/** + Read view lists the trx ids of those transactions for which a consistent read + should not see the modifications to the database. +*/ +class ReadViewBase +{ + /** + The read should not see any transaction with trx id >= this value. + In other words, this is the "high water mark". + */ + trx_id_t m_low_limit_id; + + /** + The read should see all trx ids which are strictly + smaller (<) than this value. In other words, this is the + low water mark". + */ + trx_id_t m_up_limit_id; + + /** Set of RW transactions that was active when this snapshot was taken */ + trx_ids_t m_ids; + + /** + The view does not need to see the undo logs for transactions whose + transaction number is strictly smaller (<) than this value: they can be + removed in purge if not needed by other views. + */ + trx_id_t m_low_limit_no; + +protected: + bool empty() { return m_ids.empty(); } + + /** @return the up limit id */ + trx_id_t up_limit_id() const { return m_up_limit_id; } + +public: + ReadViewBase(): m_low_limit_id(0) {} + + + /** + Append state from another view. + + This method is used to find min(m_low_limit_no), min(m_low_limit_id) and + all transaction ids below min(m_low_limit_id). These values effectively + form oldest view. 
+ + @param other view to copy from + */ + void append(const ReadViewBase &other) + { + ut_ad(&other != this); + if (m_low_limit_no > other.m_low_limit_no) + m_low_limit_no= other.m_low_limit_no; + if (m_low_limit_id > other.m_low_limit_id) + m_low_limit_id= other.m_low_limit_id; + + trx_ids_t::iterator dst= m_ids.begin(); + for (const trx_id_t id : other.m_ids) + { + if (id >= m_low_limit_id) + break; +loop: + if (dst == m_ids.end()) + { + m_ids.push_back(id); + dst= m_ids.end(); + continue; + } + if (*dst < id) + { + dst++; + goto loop; + } + else if (*dst > id) + dst= m_ids.insert(dst, id) + 1; + } + m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id), + m_ids.end()); + + m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); + ut_ad(m_up_limit_id <= m_low_limit_id); + } + + + /** + Creates a snapshot where exactly the transactions serialized before this + point in time are seen in the view. + + @param[in,out] trx transaction + */ + inline void snapshot(trx_t *trx); + + + /** + Check whether transaction id is valid. + @param[in] id transaction id to check + @param[in] name table name + + @todo changes_visible() was an unfortunate choice for this check. + It should be moved towards the functions that load trx id like + trx_read_trx_id(). No need to issue a warning, error log message should + be enough. Although statement should ideally fail if it sees corrupt + data. + */ + static void check_trx_id_sanity(trx_id_t id, const table_name_t &name); + + + /** + Check whether the changes by id are visible. + @param[in] id transaction id to check against the view + @param[in] name table name + @return whether the view sees the modifications of id. 
+ */ + bool changes_visible(trx_id_t id, const table_name_t &name) const + MY_ATTRIBUTE((warn_unused_result)) + { + if (id >= m_low_limit_id) + { + check_trx_id_sanity(id, name); + return false; + } + return id < m_up_limit_id || + m_ids.empty() || + !std::binary_search(m_ids.begin(), m_ids.end(), id); + } + + + /** + @param id transaction to check + @return true if view sees transaction id + */ + bool sees(trx_id_t id) const { return id < m_up_limit_id; } + + /** @return the low limit no */ + trx_id_t low_limit_no() const { return m_low_limit_no; } + + /** @return the low limit id */ + trx_id_t low_limit_id() const { return m_low_limit_id; } +}; + + +/** A ReadView with extra members required for trx_t::read_view. */ +class ReadView: public ReadViewBase +{ + /** + View state. + + Implemented as atomic to allow mutex-free view close and re-use. + Non-owner thread is allowed to call is_open() alone without mutex + protection as well. E.g. trx_sys.view_count() does this. + + If non-owner thread intends to access other members as well, both + is_open() and other members accesses must be protected by m_mutex. + E.g. copy_to(). + */ + std::atomic<bool> m_open; + + /** For synchronisation with purge coordinator. */ + mutable ib_mutex_t m_mutex; + + /** + trx id of creating transaction. + Used exclusively by the read view owner thread. + */ + trx_id_t m_creator_trx_id; + +public: + ReadView(): m_open(false) { mutex_create(LATCH_ID_READ_VIEW, &m_mutex); } + ~ReadView() { mutex_free(&m_mutex); } + + + /** + Opens a read view where exactly the transactions serialized before this + point in time are seen in the view. + + View becomes visible to purge thread. Intended to be called by the ReadView + owner thread. + + @param[in,out] trx transaction + */ + void open(trx_t *trx); + + + /** + Closes the view. + + View becomes not visible to purge thread. Intended to be called by the + ReadView owner thread. 
+ */ + void close() { m_open.store(false, std::memory_order_relaxed); } + + + /** Returns true if view is open. */ + bool is_open() const { return m_open.load(std::memory_order_relaxed); } + + + /** + Sets the creator transaction id. + + This should be set only for views created by RW transactions. + Intended to be called by the ReadView owner thread. + */ + void set_creator_trx_id(trx_id_t id) + { + ut_ad(id > 0); + ut_ad(m_creator_trx_id == 0); + m_creator_trx_id= id; + } + + + /** + Writes the limits to the file. + @param file file to write to + */ + void print_limits(FILE *file) const + { + mutex_enter(&m_mutex); + if (is_open()) + fprintf(file, "Trx read view will not see trx with" + " id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n", + low_limit_id(), up_limit_id()); + mutex_exit(&m_mutex); + } + + + /** + A wrapper around ReadViewBase::changes_visible(). + Intended to be called by the ReadView owner thread. + */ + bool changes_visible(trx_id_t id, const table_name_t &name) const + { return id == m_creator_trx_id || ReadViewBase::changes_visible(id, name); } + + + /** + A wrapper around ReadViewBase::append(). + Intended to be called by the purge coordinator task. + */ + void append_to(ReadViewBase *to) const + { + mutex_enter(&m_mutex); + if (is_open()) + to->append(*this); + mutex_exit(&m_mutex); + } + + + /** + Declare the object mostly unaccessible. + innodb_monitor_set_option is operating also on freed transaction objects. 
+ */ + void mem_noaccess() const + { + MEM_NOACCESS(&m_open, sizeof m_open); + /* m_mutex is accessed by innodb_show_mutex_status() + and innodb_monitor_update() even after trx_t::free() */ + MEM_NOACCESS(&m_creator_trx_id, sizeof m_creator_trx_id); + } +}; +#endif diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h new file mode 100644 index 00000000..8d770405 --- /dev/null +++ b/storage/innobase/include/rem0cmp.h @@ -0,0 +1,263 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/rem0cmp.h +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +#ifndef rem0cmp_h +#define rem0cmp_h + +#include "data0data.h" +#include "data0type.h" +#include "rem0types.h" +#include "page0types.h" + +/*************************************************************//** +Returns TRUE if two columns are equal for comparison purposes. 
+@return TRUE if the columns are considered equal in comparisons */ +ibool +cmp_cols_are_equal( +/*===============*/ + const dict_col_t* col1, /*!< in: column 1 */ + const dict_col_t* col2, /*!< in: column 2 */ + ibool check_charsets); + /*!< in: whether to check charsets */ +/** Compare two data fields. +@param[in] mtype main type +@param[in] prtype precise type +@param[in] data1 data field +@param[in] len1 length of data1 in bytes, or UNIV_SQL_NULL +@param[in] data2 data field +@param[in] len2 length of data2 in bytes, or UNIV_SQL_NULL +@return the comparison result of data1 and data2 +@retval 0 if data1 is equal to data2 +@retval negative if data1 is less than data2 +@retval positive if data1 is greater than data2 */ +int +cmp_data_data( + ulint mtype, + ulint prtype, + const byte* data1, + ulint len1, + const byte* data2, + ulint len2) + MY_ATTRIBUTE((warn_unused_result)); + +/** Compare two data fields. +@param[in] dfield1 data field; must have type field set +@param[in] dfield2 data field +@return the comparison result of dfield1 and dfield2 +@retval 0 if dfield1 is equal to dfield2 +@retval negative if dfield1 is less than dfield2 +@retval positive if dfield1 is greater than dfield2 */ +UNIV_INLINE +int +cmp_dfield_dfield( +/*==============*/ + const dfield_t* dfield1,/*!< in: data field; must have type field set */ + const dfield_t* dfield2);/*!< in: data field */ + +#ifdef UNIV_DEBUG +/** Compare a GIS data tuple to a physical record. +@param[in] dtuple data tuple +@param[in] rec R-tree record +@param[in] mode compare mode +@retval negative if dtuple is less than rec */ +int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec, + page_cur_mode_t mode) + MY_ATTRIBUTE((nonnull)); +#endif + +/** Compare two minimum bounding rectangles. 
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +inline int cmp_geometry_field(const void *a, const void *b) +{ + const byte *mbr1= static_cast<const byte*>(a); + const byte *mbr2= static_cast<const byte*>(b); + + static_assert(SPDIMS == 2, "compatibility"); + static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility"); + + /* Try to compare mbr left lower corner (xmin, ymin) */ + double x1= mach_double_read(mbr1); + double x2= mach_double_read(mbr2); + if (x1 > x2) + return 1; + if (x2 > x1) + return -1; + + double y1= mach_double_read(mbr1 + sizeof(double) * SPDIMS); + double y2= mach_double_read(mbr2 + sizeof(double) * SPDIMS); + + if (y1 > y2) + return 1; + if (y2 > y1) + return -1; + + /* left lower corner (xmin, ymin) overlaps, now right upper corner */ + x1= mach_double_read(mbr1 + sizeof(double)); + x2= mach_double_read(mbr2 + sizeof(double)); + + if (x1 > x2) + return 1; + if (x2 > x1) + return -1; + + y1= mach_double_read(mbr1 + sizeof(double) * 2 + sizeof(double)); + y2= mach_double_read(mbr2 + sizeof(double) * 2 + sizeof(double)); + + if (y1 > y2) + return 1; + if (y2 > y1) + return -1; + + return 0; +} + +/** Compare a data tuple to a physical record. 
+@param[in] dtuple data tuple +@param[in] rec B-tree record +@param[in] offsets rec_get_offsets(rec) +@param[in] n_cmp number of fields to compare +@param[in,out] matched_fields number of completely matched fields +@return the comparison result of dtuple and rec +@retval 0 if dtuple is equal to rec +@retval negative if dtuple is less than rec +@retval positive if dtuple is greater than rec */ +int +cmp_dtuple_rec_with_match_low( + const dtuple_t* dtuple, + const rec_t* rec, + const rec_offs* offsets, + ulint n_cmp, + ulint* matched_fields) + MY_ATTRIBUTE((nonnull)); +#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields) \ + cmp_dtuple_rec_with_match_low( \ + tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields) +/** Compare a data tuple to a physical record. +@param[in] dtuple data tuple +@param[in] rec B-tree or R-tree index record +@param[in] index index tree +@param[in] offsets rec_get_offsets(rec) +@param[in,out] matched_fields number of completely matched fields +@param[in,out] matched_bytes number of matched bytes in the first +field that is not matched +@return the comparison result of dtuple and rec +@retval 0 if dtuple is equal to rec +@retval negative if dtuple is less than rec +@retval positive if dtuple is greater than rec */ +int +cmp_dtuple_rec_with_match_bytes( + const dtuple_t* dtuple, + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + ulint* matched_fields, + ulint* matched_bytes) + MY_ATTRIBUTE((warn_unused_result)); +/** Compare a data tuple to a physical record. 
+@see cmp_dtuple_rec_with_match +@param[in] dtuple data tuple +@param[in] rec B-tree record +@param[in] offsets rec_get_offsets(rec) +@return the comparison result of dtuple and rec +@retval 0 if dtuple is equal to rec +@retval negative if dtuple is less than rec +@retval positive if dtuple is greater than rec */ +int +cmp_dtuple_rec( + const dtuple_t* dtuple, + const rec_t* rec, + const rec_offs* offsets); +/**************************************************************//** +Checks if a dtuple is a prefix of a record. The last field in dtuple +is allowed to be a prefix of the corresponding field in the record. +@return TRUE if prefix */ +ibool +cmp_dtuple_is_prefix_of_rec( +/*========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +/** Compare two physical records that contain the same number of columns, +none of which are stored externally. +@retval positive if rec1 (including non-ordering columns) is greater than rec2 +@retval negative if rec1 (including non-ordering columns) is less than rec2 +@retval 0 if rec1 is a duplicate of rec2 */ +int +cmp_rec_rec_simple( +/*===============*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const rec_offs* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ + const rec_offs* offsets2,/*!< in: rec_get_offsets(rec2, ...) */ + const dict_index_t* index, /*!< in: data dictionary index */ + struct TABLE* table) /*!< in: MySQL table, for reporting + duplicate key value if applicable, + or NULL */ + MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result)); + +/** Compare two B-tree or R-tree records. +Only the common first fields are compared, and externally stored field +are treated as equal. 
+@param[in] rec1 record (possibly not on an index page) +@param[in] rec2 B-tree or R-tree record in an index page +@param[in] offsets1 rec_get_offsets(rec1, index) +@param[in] offsets2 rec_get_offsets(rec2, index) +@param[in] nulls_unequal true if this is for index cardinality + statistics estimation with + innodb_stats_method=nulls_unequal + or innodb_stats_method=nulls_ignored +@param[out] matched_fields number of completely matched fields + within the first field not completely matched +@retval 0 if rec1 is equal to rec2 +@retval negative if rec1 is less than rec2 +@retval positive if rec1 is greater than rec2 */ +int +cmp_rec_rec( + const rec_t* rec1, + const rec_t* rec2, + const rec_offs* offsets1, + const rec_offs* offsets2, + const dict_index_t* index, + bool nulls_unequal = false, + ulint* matched_fields = NULL) + MY_ATTRIBUTE((nonnull(1,2,3,4,5))); + +/** Compare two data fields. +@param[in] dfield1 data field +@param[in] dfield2 data field +@return the comparison result of dfield1 and dfield2 +@retval 0 if dfield1 is equal to dfield2, or a prefix of dfield1 +@retval negative if dfield1 is less than dfield2 +@retval positive if dfield1 is greater than dfield2 */ +UNIV_INLINE +int +cmp_dfield_dfield_like_prefix( + const dfield_t* dfield1, + const dfield_t* dfield2); + +#include "rem0cmp.ic" + +#endif diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic new file mode 100644 index 00000000..6e21382d --- /dev/null +++ b/storage/innobase/include/rem0cmp.ic @@ -0,0 +1,107 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/rem0cmp.ic +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +#include <mysql_com.h> +#include <my_sys.h> + +/** Compare two data fields. +@param[in] dfield1 data field; must have type field set +@param[in] dfield2 data field +@return the comparison result of dfield1 and dfield2 +@retval 0 if dfield1 is equal to dfield2 +@retval negative if dfield1 is less than dfield2 +@retval positive if dfield1 is greater than dfield2 */ +UNIV_INLINE +int +cmp_dfield_dfield( + const dfield_t* dfield1, + const dfield_t* dfield2) +{ + const dtype_t* type; + + ut_ad(dfield_check_typed(dfield1)); + + type = dfield_get_type(dfield1); + + return(cmp_data_data(type->mtype, type->prtype, + (const byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (const byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} + +/** Compare two data fields. 
+@param[in] dfield1 data field +@param[in] dfield2 data field +@return the comparison result of dfield1 and dfield2 +@retval 0 if dfield1 is equal to dfield2, or a prefix of dfield1 +@retval negative if dfield1 is less than dfield2 +@retval positive if dfield1 is greater than dfield2 */ +UNIV_INLINE +int +cmp_dfield_dfield_like_prefix( + const dfield_t* dfield1, + const dfield_t* dfield2) +{ + const dtype_t* type; + + ut_ad(dfield_check_typed(dfield1)); + ut_ad(dfield_check_typed(dfield2)); + + type = dfield_get_type(dfield1); + +#ifdef UNIV_DEBUG + switch (type->prtype & DATA_MYSQL_TYPE_MASK) { + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + uint cs_num = (uint) dtype_get_charset_coll(type->prtype); + + if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) { + return(cs->strnncoll( + static_cast<const uchar*>( + dfield_get_data(dfield1)), + dfield_get_len(dfield1), + static_cast<const uchar*>( + dfield_get_data(dfield2)), + dfield_get_len(dfield2), + 1)); + } + + ib::fatal() << "Unable to find charset-collation " << cs_num; + return(0); +} diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h new file mode 100644 index 00000000..dbcff3e6 --- /dev/null +++ b/storage/innobase/include/rem0rec.h @@ -0,0 +1,1299 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.h +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0rec_h +#define rem0rec_h + +#ifndef UNIV_INNOCHECKSUM +#include "data0data.h" +#include "rem0types.h" +#include "mtr0types.h" +#include "page0types.h" +#include "dict0dict.h" +#include "trx0types.h" +#endif /*! UNIV_INNOCHECKSUM */ +#include <ostream> +#include <sstream> + +/* Number of extra bytes in an old-style record, +in addition to the data and the offsets */ +#define REC_N_OLD_EXTRA_BYTES 6 +/* Number of extra bytes in a new-style record, +in addition to the data and the offsets */ +#define REC_N_NEW_EXTRA_BYTES 5 + +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 + +/* The following four constants are needed in page0zip.cc in order to +efficiently compress and decompress pages. */ + +/* The offset of heap_no in a compact record */ +#define REC_NEW_HEAP_NO 4 +/* The shift of heap_no in a compact record. +The status is stored in the low-order bits. 
*/ +#define REC_HEAP_NO_SHIFT 3 + +/* Length of a B-tree node pointer, in bytes */ +#define REC_NODE_PTR_SIZE 4 + +#ifndef UNIV_INNOCHECKSUM +/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */ +constexpr rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80; +/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */ +constexpr rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000; + +/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most +significant bit denotes that the tail of a field is stored off-page. */ +constexpr rec_offs REC_2BYTE_EXTERN_MASK= 0x4000; + +constexpr size_t RECORD_OFFSET= 2; +constexpr size_t INDEX_OFFSET= + RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs); +#endif /* UNIV_INNOCHECKSUM */ + +/* Length of the rec_get_offsets() header */ +constexpr size_t REC_OFFS_HEADER_SIZE= +#ifdef UNIV_DEBUG +#ifndef UNIV_INNOCHECKSUM + sizeof(rec_t *) / sizeof(rec_offs) + + sizeof(dict_index_t *) / sizeof(rec_offs) + +#endif /* UNIV_INNOCHECKSUM */ +#endif /* UNIV_DEBUG */ + 2; + +/* Number of elements that should be initially allocated for the +offsets[] array, first passed to rec_get_offsets() */ +constexpr size_t REC_OFFS_NORMAL_SIZE= 300; +constexpr size_t REC_OFFS_SMALL_SIZE= 18; +constexpr size_t REC_OFFS_SEC_INDEX_SIZE= + /* PK max key parts */ 16 + /* sec idx max key parts */ 16 + + /* child page number for non-leaf pages */ 1; + +/** Get the base address of offsets. The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. */ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +#ifndef UNIV_INNOCHECKSUM +/* Offset consists of two parts: 2 upper bits is type and all other bits is +value */ + +/** Only 4 different values is possible! 
*/ +enum field_type_t +{ + /** normal field */ + STORED_IN_RECORD= 0 << 14, + /** this field is stored off-page */ + STORED_OFFPAGE= 1 << 14, + /** just an SQL NULL */ + SQL_NULL= 2 << 14, + /** instantly added field */ + DEFAULT= 3 << 14, +}; + +/** without 2 upper bits */ +static constexpr rec_offs DATA_MASK= 0x3fff; +/** 2 upper bits */ +static constexpr rec_offs TYPE_MASK= ~DATA_MASK; +inline field_type_t get_type(rec_offs n) +{ + return static_cast<field_type_t>(n & TYPE_MASK); +} +inline void set_type(rec_offs &n, field_type_t type) +{ + n= static_cast<rec_offs>((n & DATA_MASK) | type); +} +inline rec_offs get_value(rec_offs n) { return n & DATA_MASK; } +inline rec_offs combine(rec_offs value, field_type_t type) +{ + return static_cast<rec_offs>(get_value(value) | type); +} + +/** Compact flag ORed to the extra size returned by rec_get_offsets() */ +constexpr rec_offs REC_OFFS_COMPACT= rec_offs(~(rec_offs(~0) >> 1)); +/** External flag in offsets returned by rec_get_offsets() */ +constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1; +/** Default value flag in offsets returned by rec_get_offsets() */ +constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2; +constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1; +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +const rec_t* +rec_get_next_ptr_const( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. 
+@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the offset of the +next chained record on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ + MY_ATTRIBUTE((nonnull)); +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. */ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ + MY_ATTRIBUTE((nonnull)); +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the number of fields +in a record. 
+@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Confirms the n_fields of the entry is sane with comparing the other +record in the same page specified +@param[in] index index +@param[in] rec record of the same page +@param[in] entry index entry +@return true if n_fields is sane */ +UNIV_INLINE +bool +rec_n_fields_is_sane( + dict_index_t* index, + const rec_t* rec, + const dtuple_t* entry) + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + const rec_t* rec) /*!< in: old-style physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + const rec_t* rec) /*!< in: new-style physical record */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +The following function is used to retrieve the info bits of +a record. +@return info bits */ +UNIV_INLINE +byte +rec_get_info_bits( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Determine the status bits of a non-REDUNDANT record. 
+@param[in] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record +@return status bits */ +inline +rec_comp_status_t +rec_get_status(const rec_t* rec) +{ + byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK; + ut_ad(bits <= REC_STATUS_INSTANT); + return static_cast<rec_comp_status_t>(bits); +} + +/** Set the status bits of a non-REDUNDANT record. +@param[in,out] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record +@param[in] bits status bits */ +inline void rec_set_status(rec_t *rec, byte bits) +{ + ut_ad(bits <= REC_STATUS_INSTANT); + rec[-REC_NEW_STATUS]= static_cast<byte>((rec[-REC_NEW_STATUS] & + ~REC_NEW_STATUS_MASK) | bits); +} + +/** Get the length of added field count in a REC_STATUS_INSTANT record. +@param[in] n_add_field number of added fields, minus one +@return storage size of the field count, in bytes */ +inline unsigned rec_get_n_add_field_len(ulint n_add_field) +{ + ut_ad(n_add_field < REC_MAX_N_FIELDS); + return n_add_field < 0x80 ? 1 : 2; +} + +/** Get the added field count in a REC_STATUS_INSTANT record. +@param[in,out] header variable header of a REC_STATUS_INSTANT record +@return number of added fields */ +inline unsigned rec_get_n_add_field(const byte*& header) +{ + unsigned n_fields_add = *--header; + if (n_fields_add < 0x80) { + ut_ad(rec_get_n_add_field_len(n_fields_add) == 1); + return n_fields_add; + } + + n_fields_add &= 0x7f; + n_fields_add |= unsigned(*--header) << 7; + ut_ad(n_fields_add < REC_MAX_N_FIELDS); + ut_ad(rec_get_n_add_field_len(n_fields_add) == 2); + return n_fields_add; +} + +/** Set the added field count in a REC_STATUS_INSTANT record. 
+@param[in,out] header variable header of a REC_STATUS_INSTANT record +@param[in] n_add number of added fields, minus 1 +@return record header before the number of added fields */ +inline void rec_set_n_add_field(byte*& header, ulint n_add) +{ + ut_ad(n_add < REC_MAX_N_FIELDS); + + if (n_add < 0x80) { + *header-- = byte(n_add); + } else { + *header-- = byte(byte(n_add) | 0x80); + *header-- = byte(n_add >> 7); + } +} + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) +@return info and status bits */ +UNIV_INLINE +byte +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: compact physical record */ + ulint bits) /*!< in: info bits */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function tells if a new-style record is a node pointer. 
+@return TRUE if node pointer */ +UNIV_INLINE +bool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to test whether the data offsets +in the record are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. 
+@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +uint8_t +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +uint16_t +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Determine how many of the first n columns in a compact +physical record are stored externally. +@return number of externally stored columns */ +ulint +rec_get_n_extern_new( +/*=================*/ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Determine the offsets to each field in an index record. 
+@param[in] rec physical record +@param[in] index the index that the record belongs to +@param[in,out] offsets array comprising offsets[0] allocated elements, + or an array from rec_get_offsets(), or NULL +@param[in] n_core 0, or index->n_core_fields for leaf page +@param[in] n_fields maximum number of offsets to compute + (ULINT_UNDEFINED to compute all offsets) +@param[in,out] heap memory heap +@return the new offsets */ +rec_offs* +rec_get_offsets_func( + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets, + ulint n_core, + ulint n_fields, +#ifdef UNIV_DEBUG + const char* file, /*!< in: file name where called */ + unsigned line, /*!< in: line number where called */ +#endif /* UNIV_DEBUG */ + mem_heap_t** heap) /*!< in/out: memory heap */ +#ifdef UNIV_DEBUG + MY_ATTRIBUTE((nonnull(1,2,6,8),warn_unused_result)); +#else /* UNIV_DEBUG */ + MY_ATTRIBUTE((nonnull(1,2,6),warn_unused_result)); +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \ + rec_get_offsets_func(rec,index,offsets,leaf,n,__FILE__,__LINE__,heap) +#else /* UNIV_DEBUG */ +# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \ + rec_get_offsets_func(rec, index, offsets, leaf, n, heap) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /*!< in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint node_ptr,/*!< in: nonzero=node pointer, + 0=leaf node */ + rec_offs* offsets)/*!< in/out: array consisting of + offsets[0] allocated elements */ + MY_ATTRIBUTE((nonnull)); +#ifdef UNIV_DEBUG +/** Validate offsets returned by rec_get_offsets(). 
+@param[in] rec record, or NULL +@param[in] index the index that the record belongs in, or NULL +@param[in,out] offsets the offsets of the record +@return true */ +bool +rec_offs_validate( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets) + MY_ATTRIBUTE((nonnull(3), warn_unused_result)); +/** Update debug data in offsets, in order to tame rec_offs_validate(). +@param[in] rec record +@param[in] index the index that the record belongs in +@param[in] leaf whether the record resides in a leaf page +@param[in,out] offsets offsets from rec_get_offsets() to adjust */ +void +rec_offs_make_valid( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + rec_offs* offsets) + MY_ATTRIBUTE((nonnull)); +#else +# define rec_offs_make_valid(rec, index, leaf, offsets) +#endif /* UNIV_DEBUG */ + +/************************************************************//** +The following function is used to get the offset to the nth +data field in an old-style record. +@return offset to the field */ +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + MY_ATTRIBUTE((nonnull)); +#define rec_get_nth_field_old(rec, n, len) \ +((rec) + rec_get_nth_field_offs_old(rec, n, len)) +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. +@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ + MY_ATTRIBUTE((warn_unused_result)); +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. 
+@return offset from the origin of rec */ +UNIV_INLINE +rec_offs +rec_get_nth_field_offs( +/*===================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + MY_ATTRIBUTE((nonnull)); +#define rec_get_nth_field(rec, offsets, n, len) \ +((rec) + rec_get_nth_field_offs(offsets, n, len)) +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. +@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Mark the nth field as externally stored. +@param[in] offsets array returned by rec_get_offsets() +@param[in] n nth field */ +void +rec_offs_make_nth_extern( + rec_offs* offsets, + const ulint n); + +MY_ATTRIBUTE((nonnull)) +/** Determine the number of allocated elements for an array of offsets. +@param[in] offsets offsets after rec_offs_set_n_alloc() +@return number of elements */ +inline ulint rec_offs_get_n_alloc(const rec_offs *offsets) +{ + ut_ad(offsets); + ulint n_alloc= offsets[0]; + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + MEM_CHECK_ADDRESSABLE(offsets, n_alloc * sizeof *offsets); + return n_alloc; +} + +/** Determine the number of fields for which offsets have been initialized. +@param[in] offsets rec_get_offsets() +@return number of fields */ +inline +ulint +rec_offs_n_fields(const rec_offs* offsets) +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/** Get a flag of a record field. 
+@param[in] offsets rec_get_offsets() +@param[in] n nth field +@param[in] flag flag to extract +@return type of the record field */ +inline field_type_t rec_offs_nth_type(const rec_offs *offsets, ulint n) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return get_type(rec_offs_base(offsets)[1 + n]); +} + +/** Determine if a record field is missing +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if default bit is set */ +inline ulint rec_offs_nth_default(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == DEFAULT; +} + +/** Determine if a record field is SQL NULL +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if SQL NULL set */ +inline ulint rec_offs_nth_sql_null(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == SQL_NULL; +} + +/** Determine if a record field is stored off-page. +@param[in] offsets rec_get_offsets() +@param[in] n nth field +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +inline ulint rec_offs_nth_extern(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == STORED_OFFPAGE; +} + +/** Get a global flag of a record. +@param[in] offsets rec_get_offsets() +@param[in] flag flag to extract +@return the flag of the record field */ +inline ulint rec_offs_any_flag(const rec_offs *offsets, ulint flag) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return *rec_offs_base(offsets) & flag; +} + +/** Determine if the offsets are for a record containing off-page columns. 
+@param[in] offsets rec_get_offsets() +@return nonzero if any off-page columns exist */ +inline bool rec_offs_any_extern(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_EXTERNAL); +} + +/** Determine if the offsets are for a record that is missing fields. +@param[in] offsets rec_get_offsets() +@return nonzero if any fields need to be replaced with + dict_index_t::instant_field_value() */ +inline ulint rec_offs_any_default(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_DEFAULT); +} + +/** Determine if the offsets are for other than ROW_FORMAT=REDUNDANT. +@param[in] offsets rec_get_offsets() +@return nonzero if ROW_FORMAT is COMPACT,DYNAMIC or COMPRESSED +@retval 0 if ROW_FORMAT=REDUNDANT */ +inline ulint rec_offs_comp(const rec_offs *offsets) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return (*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, ulint comp) +{ + bool is = !!(rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG); + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). 
+@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, ulint comp) +{ + bool is = rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG; + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_add_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp) +{ + bool is = !(~rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)); + ut_ad(!is || rec_is_metadata(rec, comp)); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_alter_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_dummy || index.is_instant()); + return is; +} + +/** Determine if a record is delete-marked (not a metadata pseudo-record). 
+@param[in] rec record +@param[in] comp nonzero if ROW_FORMAT!=REDUNDANT +@return whether the record is a delete-marked user record */ +inline bool rec_is_delete_marked(const rec_t* rec, ulint comp) +{ + return (rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) + == REC_INFO_DELETED_FLAG; +} + +/** Get the nth field from an index. +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] n field number +@param[out] len length of the field in bytes, or UNIV_SQL_NULL +@return a read-only copy of the index field */ +inline +const byte* +rec_get_nth_cfield( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + ulint n, + ulint* len) +{ + /* Because this function may be invoked by innobase_rec_to_mysql() + for reporting a duplicate key during ALTER TABLE or + CREATE UNIQUE INDEX, and in that case the rec omit the fixed-size + header of 5 or 6 bytes, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + if (!rec_offs_nth_default(offsets, n)) { + return rec_get_nth_field(rec, offsets, n, len); + } + return index->instant_field_value(n, len); +} + +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns the number of extern bits set in a record. 
+@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + rec_offs*offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ + MY_ATTRIBUTE((nonnull)); +#define rec_offs_init(offsets) \ + rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +Returns the total size of record minus data size of record. +The value returned by the function is the distance from record +start to record origin in bytes. 
+@return size */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +Returns the total size of a physical record. +@return size */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +#ifdef UNIV_DEBUG +/**********************************************************//** +Returns a pointer to the start of the record. +@return pointer to start */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + const rec_t* rec, /*!< in: pointer to record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +Returns a pointer to the end of the record. +@return pointer to end */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + const rec_t* rec, /*!< in: pointer to record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +#else /* UNIV_DEBUG */ +# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets)) +# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets)) +#endif /* UNIV_DEBUG */ + +/** Copy a physical record to a buffer. +@param[in] buf buffer +@param[in] rec physical record +@param[in] offsets array returned by rec_get_offsets() +@return pointer to the origin of the copy */ +UNIV_INLINE +rec_t* +rec_copy( + void* buf, + const rec_t* rec, + const rec_offs* offsets); + +/** Determine the size of a data tuple prefix in a temporary file. 
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format +@param[in] index clustered or secondary index +@param[in] fields data fields +@param[in] n_fields number of data fields +@param[out] extra record header size +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT +@return total size, in bytes */ +template<bool redundant_temp> +ulint +rec_get_converted_size_temp( + const dict_index_t* index, + const dfield_t* fields, + ulint n_fields, + ulint* extra, + rec_comp_status_t status = REC_STATUS_ORDINARY) + MY_ATTRIBUTE((warn_unused_result, nonnull)); + +/** Determine the offset to each field in temporary file. +@param[in] rec temporary file record +@param[in] index index of that the record belongs to +@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) +@param[in] n_core number of core fields (index->n_core_fields) +@param[in] def_val default values for non-core fields +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ +void +rec_init_offsets_temp( + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets, + ulint n_core, + const dict_col_t::def_t*def_val, + rec_comp_status_t status = REC_STATUS_ORDINARY) + MY_ATTRIBUTE((nonnull(1,2,3))); +/** Determine the offset to each field in temporary file. +@param[in] rec temporary file record +@param[in] index index of that the record belongs to +@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) +*/ +void +rec_init_offsets_temp( + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets) + MY_ATTRIBUTE((nonnull)); + +/** Convert a data tuple prefix to the temporary file format. 
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format +@param[out] rec record in temporary file format +@param[in] index clustered or secondary index +@param[in] fields data fields +@param[in] n_fields number of data fields +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ +template<bool redundant_temp> +void +rec_convert_dtuple_to_temp( + rec_t* rec, + const dict_index_t* index, + const dfield_t* fields, + ulint n_fields, + rec_comp_status_t status = REC_STATUS_ORDINARY) + MY_ATTRIBUTE((nonnull)); + +/**************************************************************//** +Copies the first n fields of a physical record to a new physical record in +a buffer. +@return own: copied record */ +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n_fields, /*!< in: number of fields + to copy */ + byte** buf, /*!< in/out: memory buffer + for the copied prefix, + or NULL */ + ulint* buf_size) /*!< in/out: buffer size */ + MY_ATTRIBUTE((nonnull)); +/*********************************************************//** +Builds a physical record out of a data tuple and +stores it into the given buffer. +@return pointer to the origin of physical record */ +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + byte* buf, /*!< in: start address of the + physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of + externally stored columns */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. 
+@return extra size */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + ulint data_size, /*!< in: data size */ + ulint n_fields, /*!< in: number of fields */ + ulint n_ext) /*!< in: number of externally stored columns */ + MY_ATTRIBUTE((const)); +/**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. +@return total size */ +ulint +rec_get_converted_size_comp_prefix( +/*===============================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))); + +/** Determine the size of a record in ROW_FORMAT=COMPACT. +@param[in] index record descriptor. dict_table_is_comp() + is assumed to hold, even if it doesn't +@param[in] tuple logical record +@param[out] extra extra size +@return total size */ +ulint +rec_get_converted_size_comp( + const dict_index_t* index, + const dtuple_t* tuple, + ulint* extra) + MY_ATTRIBUTE((nonnull(1,2))); + +/**********************************************************//** +The following function returns the size of a data tuple when converted to +a physical record. +@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/** Copy the first n fields of a (copy of a) physical record to a data tuple. +The fields are copied into the memory heap. 
+@param[out] tuple data tuple +@param[in] rec index record, or a copy thereof +@param[in] index index of rec +@param[in] n_core index->n_core_fields at the time rec was + copied, or 0 if non-leaf page record +@param[in] n_fields number of fields to copy +@param[in,out] heap memory heap */ +void +rec_copy_prefix_to_dtuple( + dtuple_t* tuple, + const rec_t* rec, + const dict_index_t* index, + ulint n_core, + ulint n_fields, + mem_heap_t* heap) + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Validates the consistency of a physical record. +@return TRUE if ok */ +ibool +rec_validate( +/*=========*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints an old-style physical record to a file. */ +void +rec_print_old( +/*==========*/ + FILE* file, /*!< in: file to print to */ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a spatial index record to a file. */ +void +rec_print_mbr_rec( +/*==========*/ + FILE* file, /*!< in: file to print to */ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a physical record to a file, given its precomputed offsets. */ +void +rec_print_new( +/*==========*/ + FILE* file, /*!< in: file to print to */ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a physical record. 
*/ +void +rec_print( +/*======*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + MY_ATTRIBUTE((nonnull)); + +/** Pretty-print a record. +@param[in,out] o output stream +@param[in] rec physical record +@param[in] info rec_get_info_bits(rec) +@param[in] offsets rec_get_offsets(rec) */ +void +rec_print( + std::ostream& o, + const rec_t* rec, + ulint info, + const rec_offs* offsets); + +/** Wrapper for pretty-printing a record */ +struct rec_index_print +{ + /** Constructor */ + rec_index_print(const rec_t* rec, const dict_index_t* index) : + m_rec(rec), m_index(index) + {} + + /** Record */ + const rec_t* m_rec; + /** Index */ + const dict_index_t* m_index; +}; + +/** Display a record. +@param[in,out] o output stream +@param[in] r record to display +@return the output stream */ +std::ostream& +operator<<(std::ostream& o, const rec_index_print& r); + +/** Wrapper for pretty-printing a record */ +struct rec_offsets_print +{ + /** Constructor */ + rec_offsets_print(const rec_t* rec, const rec_offs* offsets) : + m_rec(rec), m_offsets(offsets) + {} + + /** Record */ + const rec_t* m_rec; + /** Offsets to each field */ + const rec_offs* m_offsets; +}; + +/** Display a record. +@param[in,out] o output stream +@param[in] r record to display +@return the output stream */ +ATTRIBUTE_COLD +std::ostream& +operator<<(std::ostream& o, const rec_offsets_print& r); + +/** Pretty-printer of records and tuples */ +class rec_printer : public std::ostringstream { +public: + /** Construct a pretty-printed record. + @param rec record with header + @param offsets rec_get_offsets(rec, ...) */ + ATTRIBUTE_COLD + rec_printer(const rec_t* rec, const rec_offs* offsets) + : + std::ostringstream () + { + rec_print(*this, rec, + rec_get_info_bits(rec, rec_offs_comp(offsets)), + offsets); + } + + /** Construct a pretty-printed record. 
+ @param rec record, possibly lacking header + @param info rec_get_info_bits(rec) + @param offsets rec_get_offsets(rec, ...) */ + ATTRIBUTE_COLD + rec_printer(const rec_t* rec, ulint info, const rec_offs* offsets) + : + std::ostringstream () + { + rec_print(*this, rec, info, offsets); + } + + /** Construct a pretty-printed tuple. + @param tuple data tuple */ + ATTRIBUTE_COLD + rec_printer(const dtuple_t* tuple) + : + std::ostringstream () + { + dtuple_print(*this, tuple); + } + + /** Construct a pretty-printed tuple. + @param field array of data tuple fields + @param n number of fields */ + ATTRIBUTE_COLD + rec_printer(const dfield_t* field, ulint n) + : + std::ostringstream () + { + dfield_print(*this, field, n); + } + + /** Destructor */ + ~rec_printer() override {} + +private: + /** Copy constructor */ + rec_printer(const rec_printer& other); + /** Assignment operator */ + rec_printer& operator=(const rec_printer& other); +}; + + +# ifdef UNIV_DEBUG +/** Read the DB_TRX_ID of a clustered index record. +@param[in] rec clustered index record +@param[in] index clustered index +@return the value of DB_TRX_ID */ +trx_id_t +rec_get_trx_id( + const rec_t* rec, + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ + +/* Maximum lengths for the data in a physical record if the offsets +are given in one byte (resp. two byte) format. 
*/ +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL + +/* The data size of record must not be larger than this on +REDUNDANT row format because we reserve the two uppermost bits in a +two byte offset for special purposes */ +#define REDUNDANT_REC_MAX_DATA_SIZE (16383) + +/* The data size of record must be smaller than this on +COMPRESSED row format because we reserve the two uppermost bits in a +two byte offset for special purposes */ +#define COMPRESSED_REC_MAX_DATA_SIZE (16384) + +#ifdef WITH_WSREP +int wsrep_rec_get_foreign_key( + byte *buf, /* out: extracted key */ + ulint *buf_len, /* in/out: length of buf */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index_for, /* in: index for foreign table */ + dict_index_t* index_ref, /* in: index for referenced table */ + ibool new_protocol); /* in: protocol > 1 */ +#endif /* WITH_WSREP */ + +#include "rem0rec.ic" + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* rem0rec_h */ diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic new file mode 100644 index 00000000..30c72a74 --- /dev/null +++ b/storage/innobase/include/rem0rec.ic @@ -0,0 +1,1204 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.ic +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mach0data.h" +#include "ut0byte.h" +#include "dict0boot.h" +#include "btr0types.h" + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits pointer to next record + 2 8 bits pointer to next record + 3 1 bit short flag + 7 bits number of fields + 4 3 bits number of fields + 5 bits heap number + 5 8 bits heap number + 6 4 bits n_owned + 4 bits info bits +*/ + +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. 
+ + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits relative offset of next record + 2 8 bits relative offset of next record + the relative offset is an unsigned 16-bit + integer: + (offset_of_next_record + - offset_of_this_record) mod 64Ki, + where mod is the modulo as a non-negative + number; + we can calculate the offset of the next + record with the formula: + relative_offset + offset_of_this_record + mod srv_page_size + 3 3 bits status: + 000=REC_STATUS_ORDINARY + 001=REC_STATUS_NODE_PTR + 010=REC_STATUS_INFIMUM + 011=REC_STATUS_SUPREMUM + 100=REC_STATUS_INSTANT + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + +/* We list the byte offsets from the origin of the record, the mask, +and the shift needed to obtain each bit-field of the record. */ + +#define REC_NEXT 2 +#define REC_NEXT_MASK 0xFFFFUL +#define REC_NEXT_SHIFT 0 + +#define REC_OLD_SHORT 3 /* This is single byte bit-field */ +#define REC_OLD_SHORT_MASK 0x1UL +#define REC_OLD_SHORT_SHIFT 0 + +#define REC_OLD_N_FIELDS 4 +#define REC_OLD_N_FIELDS_MASK 0x7FEUL +#define REC_OLD_N_FIELDS_SHIFT 1 + +#define REC_OLD_HEAP_NO 5 +#define REC_HEAP_NO_MASK 0xFFF8UL +#if 0 /* defined in rem0rec.h for use of page0zip.cc */ +#define REC_NEW_HEAP_NO 4 +#define REC_HEAP_NO_SHIFT 3 +#endif + +#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */ +#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */ +#define REC_N_OWNED_MASK 0xFUL +#define REC_N_OWNED_SHIFT 0 + +#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */ +#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */ +#define REC_INFO_BITS_MASK 0xF0UL +#define REC_INFO_BITS_SHIFT 0 + +#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ + ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * 
(REC_OLD_INFO_BITS - 3)) \ + ^ 0xFFFFFFFFUL +# error "sum of old-style masks != 0xFFFFFFFFUL" +#endif +#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \ + ^ 0xFFFFFFUL +# error "sum of new-style masks != 0xFFFFFFUL" +#endif + +/******************************************************//** +Gets a bit field from within 1 byte. */ +UNIV_INLINE +byte +rec_get_bit_field_1( +/*================*/ + const rec_t* rec, /*!< in: pointer to record origin */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + return static_cast<byte>((*(rec - offs) & mask) >> shift); +} + +/******************************************************//** +Sets a bit field within 1 byte. */ +UNIV_INLINE +void +rec_set_bit_field_1( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask); + ut_ad(mask <= 0xFFUL); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_1(rec - offs, + (mach_read_from_1(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +Gets a bit field from within 2 bytes. 
*/ +UNIV_INLINE +ulint +rec_get_bit_field_2( +/*================*/ + const rec_t* rec, /*!< in: pointer to record origin */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + + return((mach_read_from_2(rec - offs) & mask) >> shift); +} + +/******************************************************//** +Sets a bit field within 2 bytes. */ +UNIV_INLINE +void +rec_set_bit_field_2( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask > 0xFFUL); + ut_ad(mask <= 0xFFFFUL); + ut_ad((mask >> shift) & 1); + ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_2(rec - offs, + (mach_read_from_2(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. 
+@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +const rec_t* +rec_get_next_ptr_const( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; + + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (field_value == 0) { + + return(NULL); + } + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < srv_page_size + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, srv_page_size) + < srv_page_size); +#endif + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return((byte*) ut_align_down(rec, srv_page_size) + + ut_align_offset(rec + field_value, srv_page_size)); + } else { + ut_ad(field_value < srv_page_size); + + return((byte*) ut_align_down(rec, srv_page_size) + + field_value); + } +} + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. 
+@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp))); +} + +/******************************************************//** +The following function is used to get the offset of the next chained record +on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < srv_page_size + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, srv_page_size) + < srv_page_size); +#endif + if (field_value == 0) { + + return(0); + } + + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return(ut_align_offset(rec + field_value, srv_page_size)); + } else { + ut_ad(field_value < srv_page_size); + + return(field_value); + } +} + +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. 
*/ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ut_ad(srv_page_size > next); + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); + mach_write_to_2(rec - REC_NEXT, next); +} + +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. */ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ulint field_value; + + ut_ad(srv_page_size > next); + + if (!next) { + field_value = 0; + } else { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint) + ((lint) next + - (lint) ut_align_offset(rec, srv_page_size)); + field_value &= REC_NEXT_MASK; + } + + mach_write_to_2(rec - REC_NEXT, field_value); +} + +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, + REC_OLD_N_FIELDS_SHIFT); + ut_ad(ret <= REC_MAX_N_FIELDS); + ut_ad(ret > 0); + + return(ret); +} + +/******************************************************//** +The following function is used to set the number of fields +in an old-style record. 
*/ +UNIV_INLINE +void +rec_set_n_fields_old( +/*=================*/ + rec_t* rec, /*!< in: physical record */ + ulint n_fields) /*!< in: the number of fields */ +{ + ut_ad(rec); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields > 0); + + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/******************************************************//** +The following function is used to get the number of fields +in a record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + return(rec_get_n_fields_old(rec)); + } + + switch (rec_get_status(rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + } + + ut_error; + return(ULINT_UNDEFINED); +} + +/** Confirms the n_fields of the entry is sane with comparing the other +record in the same page specified +@param[in] index index +@param[in] rec record of the same page +@param[in] entry index entry +@return true if n_fields is sane */ +UNIV_INLINE +bool +rec_n_fields_is_sane( + dict_index_t* index, + const rec_t* rec, + const dtuple_t* entry) +{ + const ulint n_fields = rec_get_n_fields(rec, index); + + return(n_fields == dtuple_get_n_fields(entry) + || (index->is_instant() + && n_fields >= index->n_core_fields) + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. 
*/ + || (index->table->id == DICT_INDEXES_ID + && n_fields == dtuple_get_n_fields(entry) - 1)); +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + const rec_t* rec) /*!< in: old-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + const rec_t* rec) /*!< in: new-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/******************************************************//** +The following function is used to retrieve the info bits of a record. +@return info bits */ +UNIV_INLINE +byte +rec_get_info_bits( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + return rec_get_bit_field_1( + rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) 
+@return info and status bits */ +UNIV_INLINE +byte +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) + & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); + if (comp) + return static_cast<byte>(rec_get_info_bits(rec, TRUE) | + rec_get_status(rec)); + else + return rec_get_info_bits(rec, FALSE); +} +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: physical record */ + ulint bits) /*!< in: info bits */ +{ + compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) + & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); + rec_set_status(rec, bits & REC_NEW_STATUS_MASK); + rec_set_bit_field_1(rec, bits & ~REC_NEW_STATUS_MASK, + REC_NEW_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + if (comp) { + return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT)); + } else { + return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT)); + } +} + +/******************************************************//** +The following function tells if a new-style record is a node pointer. 
+@return TRUE if node pointer */ +UNIV_INLINE +bool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(REC_STATUS_NODE_PTR == rec_get_status(rec)); +} + +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to test whether the data offsets in the record +are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); +} + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. 
*/ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ +{ + ut_ad(flag <= 1); + + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +uint8_t +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +uint16_t +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); +} + +/******************************************************//** +Returns nonzero if the field is stored off-page. 
+@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK); +} + +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + rec_offs*offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ +{ + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + MEM_UNDEFINED(offsets, n_alloc * sizeof *offsets); + offsets[0] = static_cast<rec_offs>(n_alloc); +} + +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. +@return offset from the origin of rec */ +UNIV_INLINE +rec_offs +rec_get_nth_field_offs( +/*===================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null; UNIV_SQL_DEFAULT is default value */ +{ + ut_ad(n < rec_offs_n_fields(offsets)); + + rec_offs offs = n == 0 ? 0 : get_value(rec_offs_base(offsets)[n]); + rec_offs next_offs = rec_offs_base(offsets)[1 + n]; + + if (get_type(next_offs) == SQL_NULL) { + *len = UNIV_SQL_NULL; + } else if (get_type(next_offs) == DEFAULT) { + *len = UNIV_SQL_DEFAULT; + } else { + *len = get_value(next_offs) - offs; + } + + return(offs); +} + +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. 
+@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */ +{ + ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(NULL); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field + = rec_get_nth_field(rec, offsets, i, &len); + + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + if (!memcmp(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + return(field); + } + } + } + + return(NULL); +} + +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + if (!n) { + return get_value(rec_offs_base(offsets)[1 + n]); + } + return get_value((rec_offs_base(offsets)[1 + n])) + - get_value(rec_offs_base(offsets)[n]); +} + +/******************************************************//** +Returns the number of extern bits set in a record. +@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n = 0; + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + if (rec_offs_nth_extern(offsets, i)) { + n++; + } + } + } + + return(n); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 1-byte +offsets form. 
If the field is SQL null, the flag is ORed in the returned +value. This function and the 2-byte counterpart are defined here because the +C-compiler was not able to sum negative and positive constant offsets, and +warned of constant arithmetic overflow within the compiler. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_2_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +1-byte format. */ +UNIV_INLINE +void +rec_1_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +2-byte format. 
*/ +UNIV_INLINE +void +rec_2_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 1-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_1_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_1_get_prev_field_end_info(rec, n) + & ~REC_1BYTE_SQL_NULL_MASK); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 2-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_2_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_2_get_prev_field_end_info(rec, n) + & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK)); +} + +/******************************************************//** +The following function is used to read the offset of the start of a data field +in the record. The start of an SQL null field is the end offset of the +previous non-null field, or 0, if none exists. If n is the number of the last +field + 1, then the end offset of the last field is returned. 
+@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_get_field_start_offs( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + if (rec_get_1byte_offs_flag(rec)) { + + return(rec_1_get_field_start_offs(rec, n)); + } + + return(rec_2_get_field_start_offs(rec, n)); +} + +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. +@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ +{ + ulint os; + ulint next_os; + + os = rec_get_field_start_offs(rec, n); + next_os = rec_get_field_start_offs(rec, n + 1); + + ut_ad(next_os - os < srv_page_size); + + return(next_os - os); +} + +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ut_ad(rec); + + return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec))); +} + +/**********************************************************//** +The following function sets the number of fields in offsets. 
*/ +UNIV_INLINE +void +rec_offs_set_n_fields( +/*==================*/ + rec_offs* offsets, /*!< in/out: array returned by + rec_get_offsets() */ + ulint n_fields) /*!< in: number of fields */ +{ + ut_ad(offsets); + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + offsets[1] = static_cast<rec_offs>(n_fields); +} + +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint size; + + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = get_value(rec_offs_base(offsets)[rec_offs_n_fields(offsets)]); + ut_ad(size < srv_page_size); + return(size); +} + +/**********************************************************//** +Returns the total size of record minus data size of record. The value +returned by the function is the distance from record start to record origin +in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint size; + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = *rec_offs_base(offsets) & REC_OFFS_MASK; + ut_ad(size < srv_page_size); + return(size); +} + +/**********************************************************//** +Returns the total size of a physical record. 
+@return size */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets)); +} + +#ifdef UNIV_DEBUG +/**********************************************************//** +Returns a pointer to the end of the record. +@return pointer to end */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + const rec_t* rec, /*!< in: pointer to record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets))); +} + +/**********************************************************//** +Returns a pointer to the start of the record. +@return pointer to start */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + const rec_t* rec, /*!< in: pointer to record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets))); +} +#endif /* UNIV_DEBUG */ + +/** Copy a physical record to a buffer. +@param[in] buf buffer +@param[in] rec physical record +@param[in] offsets array returned by rec_get_offsets() +@return pointer to the origin of the copy */ +UNIV_INLINE +rec_t* +rec_copy( + void* buf, + const rec_t* rec, + const rec_offs* offsets) +{ + ulint extra_len; + ulint data_len; + + ut_ad(rec != NULL); + ut_ad(buf != NULL); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + + extra_len = rec_offs_extra_size(offsets); + data_len = rec_offs_data_size(offsets); + + memcpy(buf, rec - extra_len, extra_len + data_len); + + return((byte*) buf + extra_len); +} + +/**********************************************************//** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. 
+@return extra size */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + ulint data_size, /*!< in: data size */ + ulint n_fields, /*!< in: number of fields */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) { + + return(REC_N_OLD_EXTRA_BYTES + n_fields); + } + + return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields); +} + +/**********************************************************//** +The following function returns the size of a data tuple when converted to +a physical record. +@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + ulint data_size; + ulint extra_size; + + ut_ad(dtuple_check_typed(dtuple)); +#ifdef UNIV_DEBUG + if (dict_index_is_ibuf(index)) { + ut_ad(dtuple->n_fields > 1); + } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) { + ut_ad(dtuple->n_fields - 1 + == dict_index_get_n_unique_in_tree_nonleaf(index)); + } else if (index->table->id == DICT_INDEXES_ID) { + /* The column SYS_INDEXES.MERGE_THRESHOLD was + instantly added in MariaDB 10.2.2 (MySQL 5.7). */ + ut_ad(!index->table->is_temporary()); + ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES); + ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES + || dtuple->n_fields + == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD); + } else { + ut_ad(dtuple->n_fields >= index->n_core_fields); + ut_ad(dtuple->n_fields <= index->n_fields + || dtuple->is_alter_metadata()); + } +#endif + + if (dict_table_is_comp(index->table)) { + return rec_get_converted_size_comp(index, dtuple, NULL); + } + + data_size = dtuple_get_data_size(dtuple, 0); + + /* If primary key is being updated then the new record inherits + externally stored fields from the delete-marked old record. 
+ In that case, n_ext may be less value than + dtuple_get_n_ext(tuple). */ + ut_ad(n_ext <= dtuple_get_n_ext(dtuple)); + extra_size = rec_get_converted_extra_size( + data_size, dtuple_get_n_fields(dtuple), n_ext); + + return(data_size + extra_size); +} diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h new file mode 100644 index 00000000..0e4075a9 --- /dev/null +++ b/storage/innobase/include/rem0types.h @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0types.h +Record manager global types + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0types_h +#define rem0types_h + +/* We define the physical record simply as an array of bytes */ +typedef byte rec_t; + +/** This type represents a field offset in a rec_t* */ +typedef unsigned short int rec_offs; + +/* Maximum values for various fields (for non-blob tuples) */ +#define REC_MAX_N_FIELDS (1024 - 1) +#define REC_MAX_HEAP_NO (2 * 8192 - 1) +#define REC_MAX_N_OWNED (16 - 1) + +/* Maximum number of user defined fields/columns. The reserved columns +are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR. +Before MariaDB Server 10.5, we needed "* 2" because mlog_parse_index() +created a dummy table object possibly, with some of the system columns +in it, and then adds the 3 system columns (again) using +dict_table_add_system_columns(). +For now, we will keep this limitation to maintain file format compatibility +with older versions. */ +#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2) + +/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum +indexed field length (or indexed prefix length) for indexes on tables of +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format. +Before we support UTF-8 encodings with mbmaxlen = 4, a UTF-8 character +may take at most 3 bytes. So the limit was set to 3*256, so that one +can create a column prefix index on 256 characters of a TEXT or VARCHAR +column also in the UTF-8 charset. 
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ +#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768 + +/** Maximum indexed field length for tables that have atomic BLOBs. +This (3072) is the maximum index row length allowed, so we cannot create index +prefix column longer than that. */ +#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072 + +/** Innodb row types are a subset of the MySQL global enum row_type. +They are made into their own enum so that switch statements can account +for each of them. */ +enum rec_format_enum { + REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */ + REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */ + REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */ + REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */ +}; +typedef enum rec_format_enum rec_format_t; + +#endif diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h new file mode 100644 index 00000000..251f3125 --- /dev/null +++ b/storage/innobase/include/row0ext.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.h +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#ifndef row0ext_h +#define row0ext_h + +#include "data0types.h" +#include "mem0mem.h" +#include "dict0types.h" +#include "fsp0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a cache of column prefixes of externally stored columns. +@return own: column prefix cache */ +row_ext_t* +row_ext_create( +/*===========*/ + ulint n_ext, /*!< in: number of externally stored columns */ + const ulint* ext, /*!< in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + const dict_table_t& table, /*!< in: table */ + const dtuple_t* tuple, /*!< in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). */ + mem_heap_t* heap); /*!< in: heap where created */ + +/********************************************************************//** +Looks up a column prefix of an externally stored column. 
+@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT(); + 0 if no prefix is cached for the column, + in which case field_ref_zero is returned */ +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ + +/** Prefixes of externally stored columns */ +struct row_ext_t{ + ulint n_ext; /*!< number of externally stored columns */ + const ulint* ext; /*!< col_no's of externally stored columns */ + byte* buf; /*!< backing store of the column prefix cache; + the prefix of ext[i] is read at + buf + i * max_len */ + ulint max_len;/*!< maximum prefix length, it could be + REC_ANTELOPE_MAX_INDEX_COL_LEN or + REC_VERSION_56_MAX_INDEX_COL_LEN depending + on row format */ + ulint zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */ + ulint len[1]; /*!< prefix lengths; 0 if not cached; + indexed like ext[] */ +}; + +#include "row0ext.ic" + +#endif diff --git a/storage/innobase/include/row0ext.ic b/storage/innobase/include/row0ext.ic new file mode 100644 index 00000000..913b51b3 --- /dev/null +++ b/storage/innobase/include/row0ext.ic @@ -0,0 +1,87 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its 
affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.ic +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "rem0types.h" +#include "btr0types.h" + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ut_ad(ext); + ut_ad(len); + ut_ad(i < ext->n_ext); + + *len = ext->len[i]; + + ut_ad(*len <= ext->max_len); + ut_ad(ext->max_len > 0); + + if (*len == 0) { + /* The BLOB could not be fetched to the cache. */ + return(field_ref_zero); + } else { + return(ext->buf + i * ext->max_len); + } +} + +/********************************************************************//** +Looks up a column prefix of an externally stored column. 
+@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ulint i; + + ut_ad(ext); + ut_ad(len); + + for (i = 0; i < ext->n_ext; i++) { + if (col == ext->ext[i]) { + return(row_ext_lookup_ith(ext, i, len)); + } + } + + return(NULL); +} diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h new file mode 100644 index 00000000..99c85601 --- /dev/null +++ b/storage/innobase/include/row0ftsort.h @@ -0,0 +1,265 @@ +/***************************************************************************** + +Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ftsort.h +Create Full Text Index with (parallel) merge sort + +Created 10/13/2010 Jimmy Yang +*******************************************************/ + +#ifndef row0ftsort_h +#define row0ftsort_h + +#include "data0data.h" +#include "fts0fts.h" +#include "fts0priv.h" +#include "rem0types.h" +#include "row0merge.h" +#include "btr0bulk.h" +#include "srv0srv.h" + +/** This structure defines the information the scan thread will fetch +and put to the linked list for parallel tokenization/sort threads +to process */ +typedef struct fts_doc_item fts_doc_item_t; + +/** A document item: the field holding the document string, its document ID, +and the list node that links it to other doc items */ +struct fts_doc_item { + dfield_t* field; /*!< field contains document string */ + doc_id_t doc_id; /*!< document ID */ + UT_LIST_NODE_T(fts_doc_item_t) doc_list; + /*!< list of doc items */ +}; + +/** This defines the list type that scan thread would feed the parallel +tokenization threads and sort threads. 
*/ +typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t; + +#define FTS_PLL_MERGE 1 + +/** Sort information passed to each individual parallel sort thread */ +struct fts_psort_t; + +/** Common info passed to each parallel sort thread */ +struct fts_psort_common_t { + row_merge_dup_t* dup; /*!< descriptor of FTS index */ + dict_table_t* new_table; /*!< source table */ + /** Old table page size */ + ulint old_zip_size; + trx_t* trx; /*!< transaction */ + fts_psort_t* all_info; /*!< all parallel sort info */ + os_event_t sort_event; /*!< sort event */ + ibool opt_doc_id_size;/*!< whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort, if + Doc ID will not be big enough + to use 8 bytes value */ +}; + +struct fts_psort_t { + ulint psort_id; /*!< Parallel sort ID */ + row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX]; + /*!< sort buffer */ + merge_file_t* merge_file[FTS_NUM_AUX_INDEX]; + /*!< sort file */ + row_merge_block_t* merge_block[FTS_NUM_AUX_INDEX]; + /*!< buffer to write to file */ + row_merge_block_t* crypt_block[FTS_NUM_AUX_INDEX]; + /*!< buffer to crypt data */ + ulint child_status; /*!< child task status */ + ulint state; /*!< parent state */ + fts_doc_list_t fts_doc_list; /*!< doc list to process */ + fts_psort_common_t* psort_common; /*!< ptr to all psort info */ + tpool::waitable_task* task; /*!< threadpool task */ + dberr_t error; /*!< db error during psort */ + ulint memory_used; /*!< memory used by fts_doc_list */ + ib_mutex_t mutex; /*!< mutex for fts_doc_list */ +}; + +/** Row fts token for plugin parser */ +struct row_fts_token_t { + fts_string_t* text; /*!< token */ + UT_LIST_NODE_T(row_fts_token_t) + token_list; /*!< next token link */ +}; + +typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t; + +/** Structure stores information from string tokenization operation */ +struct fts_tokenize_ctx { + ulint processed_len; /*!< processed string length */ + ulint init_pos; /*!< doc start position */ + ulint 
buf_used; /*!< the sort buffer (ID) when + tokenization stops, which + could due to sort buffer full */ + ulint rows_added[FTS_NUM_AUX_INDEX]; + /*!< number of rows added for + each FTS index partition */ + ib_rbt_t* cached_stopword;/*!< in: stopword list */ + dfield_t sort_field[FTS_NUM_FIELDS_SORT]; + /*!< in: sort field */ + fts_token_list_t fts_token_list; + + fts_tokenize_ctx() : + processed_len(0), init_pos(0), buf_used(0), + rows_added(), cached_stopword(NULL), sort_field(), + fts_token_list() + { + memset(rows_added, 0, sizeof rows_added); + memset(sort_field, 0, sizeof sort_field); + UT_LIST_INIT(fts_token_list, &row_fts_token_t::token_list); + } +}; + +typedef struct fts_tokenize_ctx fts_tokenize_ctx_t; + +/** Structure stores information needed for the insertion phase of FTS +parallel sort. */ +struct fts_psort_insert { + CHARSET_INFO* charset; /*!< charset info */ + mem_heap_t* heap; /*!< heap */ + ibool opt_doc_id_size;/*!< Whether to use smaller (4 bytes) + integer for Doc ID */ + BtrBulk* btr_bulk; /*!< Bulk load instance */ + dtuple_t* tuple; /*!< Tuple to insert */ + +#ifdef UNIV_DEBUG + ulint aux_index_id; /*!< Auxiliary index id */ +#endif +}; + +typedef struct fts_psort_insert fts_psort_insert_t; + + +/** status bit used for communication between parent and child thread */ +#define FTS_PARENT_COMPLETE 1 +#define FTS_PARENT_EXITING 2 +#define FTS_CHILD_COMPLETE 1 +#define FTS_CHILD_EXITING 2 + +/** Print some debug information */ +#define FTSORT_PRINT + +#ifdef FTSORT_PRINT +#define DEBUG_FTS_SORT_PRINT(str) \ + do { \ + ut_print_timestamp(stderr); \ + fprintf(stderr, str); \ + } while (0) +#else +#define DEBUG_FTS_SORT_PRINT(str) +#endif /* FTSORT_PRINT */ + +/*************************************************************//** +Create a temporary "fts sort index" used to merge sort the +tokenized doc string. The index has three "fields": + +1) Tokenized word, +2) Doc ID +3) Word's position in original 'doc'. 
+ +@return dict_index_t structure for the fts sort index */ +dict_index_t* +row_merge_create_fts_sort_index( +/*============================*/ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + dict_table_t* table, /*!< in,out: table that FTS index + is being created on */ + ibool* opt_doc_id_size); + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ + +/** Initialize FTS parallel sort structures. +@param[in] trx transaction +@param[in,out] dup descriptor of FTS index being created +@param[in] new_table table where indexes are created +@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes + integer to store Doc ID during sort +@param[in] old_zip_size page size of the old table during alter +@param[out] psort parallel sort info to be instantiated +@param[out] merge parallel merge info to be instantiated +@return true if all successful */ +bool +row_fts_psort_info_init( + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) + MY_ATTRIBUTE((nonnull)); + +/********************************************************************//** +Clean up and deallocate FTS parallel sort structures, and close +temporary merge sort files */ +void +row_fts_psort_info_destroy( +/*=======================*/ + fts_psort_t* psort_info, /*!< parallel sort info */ + fts_psort_t* merge_info); /*!< parallel merge info */ +/********************************************************************//** +Free up merge buffers when merge sort is done */ +void +row_fts_free_pll_merge_buf( +/*=======================*/ + fts_psort_t* psort_info); /*!< in: parallel sort info */ + +/*********************************************************************//** +Start the parallel tokenization and parallel merge sort */ +void +row_fts_start_psort( +/*================*/ + fts_psort_t* psort_info); /*!< in: 
parallel sort info */ +/*********************************************************************//** +Kick off the parallel merge and insert thread */ +void +row_fts_start_parallel_merge( +/*=========================*/ + fts_psort_t* merge_info); /*!< in: parallel sort info */ +/********************************************************************//** +Propagate a newly added record up one level in the selection tree +@return parent where this value propagated to */ +int +row_merge_fts_sel_propagate( +/*========================*/ + int propogated, /*!< in: tree node propagated */ + int* sel_tree, /*!< in: selection tree */ + ulint level, /*!< in: selection tree level */ + const mrec_t** mrec, /*!< in: sort record */ + rec_offs** offsets, /*!< in: record offsets */ + dict_index_t* index); /*!< in: FTS index */ +/********************************************************************//** +Read sorted file containing index data tuples and insert these data +tuples to the index +@return DB_SUCCESS or error number */ +dberr_t +row_fts_merge_insert( +/*=================*/ + dict_index_t* index, /*!< in: index */ + dict_table_t* table, /*!< in: new table */ + fts_psort_t* psort_info, /*!< parallel sort info */ + ulint id) /*!< in: which auxiliary table's data + to insert to */ + MY_ATTRIBUTE((nonnull)); +#endif /* row0ftsort_h */ diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h new file mode 100644 index 00000000..fd2651da --- /dev/null +++ b/storage/innobase/include/row0import.h @@ -0,0 +1,67 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0import.h +Header file for import tablespace functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0import_h +#define row0import_h + +#include "dict0types.h" + +// Forward declarations +struct trx_t; +struct dict_table_t; +struct row_prebuilt_t; + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct + in MySQL */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN. +@param[in,out] trx dictionary transaction +@param[in] table_id table identifier +@param[in] discarded whether to set or clear the flag +@return DB_SUCCESS or error code */ +dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id, + bool discarded) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Update the root page numbers and tablespace ID of a table. 
+@param[in,out] trx dictionary transaction +@param[in,out] table persistent table +@param[in] reset whether to reset the fields to FIL_NULL +@return DB_SUCCESS or error code */ +dberr_t +row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#endif /* row0import_h */ diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h new file mode 100644 index 00000000..9a16394a --- /dev/null +++ b/storage/innobase/include/row0ins.h @@ -0,0 +1,224 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ins.h +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0ins_h +#define row0ins_h + +#include "data0data.h" +#include "que0types.h" +#include "trx0types.h" +#include "row0types.h" +#include <vector> + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. 
Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_foreign_key_check_lock. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or +DB_ROW_IS_REFERENCED */ +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE If we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row); /*!< in: new row (or first row) for the node */ +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread or NULL */ + MY_ATTRIBUTE((warn_unused_result)); + +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ + MY_ATTRIBUTE((warn_unused_result)); +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. 
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + bool check_foreign = true) /*!< in: true if check + foreign table is needed, false otherwise */ + MY_ATTRIBUTE((warn_unused_result)); +/***********************************************************//** +Inserts a row to a table. This is a high-level function used in +SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_ins_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* Insert node types */ +#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */ +#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */ +#define INS_DIRECT 2 /* this is for internal use in dict0crea: + insert the row directly */ + +/* Node execution states */ +#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */ +#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */ +#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and + inserted */ + +struct row_prebuilt_t; + +/** Insert node structure */ +struct ins_node_t +{ + explicit ins_node_t(ulint ins_type, dict_table_t *table) : + common(QUE_NODE_INSERT, NULL), + ins_type(ins_type), + row(NULL), table(table), select(NULL), values_list(NULL), + state(INS_NODE_SET_IX_LOCK), index(NULL), + entry_list(), entry(entry_list.end()), + trx_id(0), entry_sys_heap(mem_heap_create(128)) + { + } + que_common_t common; /*!< node type: QUE_NODE_INSERT */ + ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */ + dtuple_t* row; /*!< row to insert */ + dict_table_t* table; /*!< table where to insert */ + sel_node_t* select; /*!< select in searched insert */ + que_node_t* values_list;/* list of expressions to evaluate and + insert in an INS_VALUES insert */ + ulint state; /*!< node 
execution state */ + dict_index_t* index; /*!< NULL, or the next index where the index + entry should be inserted */ + std::vector<dtuple_t*> + entry_list;/* list of entries, one for each index */ + std::vector<dtuple_t*>::iterator + entry; /*!< NULL, or entry to insert in the index; + after a successful insert of the entry, + this should be reset to NULL */ + /** buffer for the system columns */ + byte sys_buf[DATA_ROW_ID_LEN + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + trx_id_t trx_id; /*!< trx id of the last trx which executed the + node */ + byte vers_start_buf[8]; /* Buffers for System Versioning */ + byte vers_end_buf[8]; /* system fields. */ + mem_heap_t* entry_sys_heap; + /* memory heap used as auxiliary storage; + entry_list and sys fields are stored here; + if this is NULL, entry list should be created + and buffers for sys fields in row allocated */ + void vers_update_end(row_prebuilt_t *prebuilt, bool history_row); + bool vers_history_row() const; /* true if 'row' is historical */ +}; + +/** Create an insert object. +@param ins_type INS_VALUES, ... +@param table table where to insert +@param heap memory heap +@return the created object */ +inline ins_node_t *ins_node_create(ulint ins_type, dict_table_t *table, + mem_heap_t *heap) +{ + return new (mem_heap_alloc(heap, sizeof(ins_node_t))) + ins_node_t(ins_type, table); +} + +#endif diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h new file mode 100644 index 00000000..88fce314 --- /dev/null +++ b/storage/innobase/include/row0log.h @@ -0,0 +1,268 @@ +/***************************************************************************** + +Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.h +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#ifndef row0log_h +#define row0log_h + +#include "que0types.h" +#include "mtr0types.h" +#include "row0types.h" +#include "rem0types.h" +#include "data0types.h" +#include "trx0types.h" + +class ut_stage_alter_t; + +extern Atomic_counter<ulint> onlineddl_rowlog_rows; +extern ulint onlineddl_rowlog_pct_used; +extern ulint onlineddl_pct_progress; + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. 
+@retval true if success, false if not */ +bool +row_log_allocate( +/*=============*/ + const trx_t* trx, /*!< in: the ALTER TABLE transaction */ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ + const char* path, /*!< in: where to create temporary file */ + const TABLE* old_table, /*!< in:table definition before alter */ + bool allow_not_null) /*!< in: allow null to non-null + conversion */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/******************************************************//** +Free the row log for an index that was being created online. */ +void +row_log_free( +/*=========*/ + row_log_t* log) /*!< in,own: row log */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +Free the row log for an index on which online creation was aborted. */ +UNIV_INLINE +void +row_log_abort_sec( +/*==============*/ + dict_index_t* index) /*!< in/out: index (x-latched) */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +Try to log an operation to a secondary index that is +(or was) being created. 
+@retval true if the operation was logged or can be ignored +@retval false if online index creation is not taking place */ +UNIV_INLINE +bool +row_log_online_op_try( +/*==================*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************//** +Logs an operation to a secondary index that is (or was) being created. */ +void +row_log_online_op( +/*==============*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ + ATTRIBUTE_COLD __attribute__((nonnull)); + +/******************************************************//** +Gets the error status of the online index rebuild log. +@return DB_SUCCESS or error code */ +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Check whether a virtual column is indexed in the new table being +created during alter table +@param[in] index cluster index +@param[in] v_no virtual column number +@return true if it is indexed, else false */ +bool +row_log_col_is_indexed( + const dict_index_t* index, + ulint v_no); + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). 
*/ +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should + be logged, or NULL to use those in rec */ + ATTRIBUTE_COLD __attribute__((nonnull(1,2,3))); + +/******************************************************//** +Logs an update operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). */ +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk);/*!< in: row_log_table_get_pk() + before the update */ + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. +@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index), + or NULL */ + byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for + row_log_table_delete(), or NULL */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ + ATTRIBUTE_COLD __attribute__((nonnull(1,2,5), warn_unused_result)); + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). 
*/ +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets);/*!< in: rec_get_offsets(rec,index) */ +/******************************************************//** +Notes that a BLOB is being freed during online ALTER TABLE. */ +void +row_log_table_blob_free( +/*====================*/ + dict_index_t* index, /*!< in/out: clustered index, X-latched */ + ulint page_no)/*!< in: starting page number of the BLOB */ + ATTRIBUTE_COLD __attribute__((nonnull)); +/******************************************************//** +Notes that a BLOB is being allocated during online ALTER TABLE. */ +void +row_log_table_blob_alloc( +/*=====================*/ + dict_index_t* index, /*!< in/out: clustered index, X-latched */ + ulint page_no)/*!< in: starting page number of the BLOB */ + ATTRIBUTE_COLD __attribute__((nonnull)); + +/** Apply the row_log_table log to a table upon completing rebuild. +@param[in] thr query graph +@param[in] old_table old table +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_table() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@param[in] new_table Altered table +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_table_apply( + que_thr_t* thr, + dict_table_t* old_table, + struct TABLE* table, + ut_stage_alter_t* stage, + dict_table_t* new_table) + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. 
+@return latest transaction ID, or 0 if nothing was logged */ +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Apply the row log to the index upon completing index creation. +@param[in] trx transaction (for checking if the operation was +interrupted) +@param[in,out] index secondary index +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_index() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_apply( + const trx_t* trx, + dict_index_t* index, + struct TABLE* table, + ut_stage_alter_t* stage) + MY_ATTRIBUTE((warn_unused_result)); + +/** Get the n_core_fields of online log for the index +@param index index whose n_core_fields of log to be accessed +@return number of n_core_fields */ +unsigned row_log_get_n_core_fields(const dict_index_t *index); + +#ifdef HAVE_PSI_STAGE_INTERFACE +/** Estimate how much work is to be done by the log apply phase +of an ALTER TABLE for this index. +@param[in] index index whose log to assess +@return work to be done by log-apply in abstract units +*/ +ulint +row_log_estimate_work( + const dict_index_t* index); +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +#include "row0log.ic" + +#endif /* row0log.h */ diff --git a/storage/innobase/include/row0log.ic b/storage/innobase/include/row0log.ic new file mode 100644 index 00000000..44d17bbc --- /dev/null +++ b/storage/innobase/include/row0log.ic @@ -0,0 +1,84 @@ +/***************************************************************************** + +Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.ic +Modification log for online index creation and online table rebuild + +Created 2012-10-18 Marko Makela +*******************************************************/ + +#include "dict0dict.h" + +/******************************************************//** +Free the row log for an index on which online creation was aborted. */ +UNIV_INLINE +void +row_log_abort_sec( +/*===============*/ + dict_index_t* index) /*!< in/out: index (x-latched) */ +{ + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)); + + ut_ad(!dict_index_is_clust(index)); + dict_index_set_online_status(index, ONLINE_INDEX_ABORTED); + row_log_free(index->online_log); + index->online_log = NULL; +} + +/******************************************************//** +Try to log an operation to a secondary index that is +(or was) being created. 
+@retval true if the operation was logged or can be ignored +@retval false if online index creation is not taking place */ +UNIV_INLINE +bool +row_log_online_op_try( +/*==================*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ +{ + + ut_ad(rw_lock_own_flagged( + dict_index_get_lock(index), + RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + /* This is a normal index. Do not log anything. + The caller must perform the operation on the + index tree directly. */ + return(false); + case ONLINE_INDEX_CREATION: + /* The index is being created online. Log the + operation. */ + row_log_online_op(index, tuple, trx_id); + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* The index was created online, but the operation was + aborted. Do not log the operation and tell the caller + to skip the operation. */ + break; + } + + return(true); +} diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h new file mode 100644 index 00000000..1d7f9bb1 --- /dev/null +++ b/storage/innobase/include/row0merge.h @@ -0,0 +1,464 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0merge.h +Index build routines using a merge sort + +Created 13/06/2005 Jan Lindstrom +*******************************************************/ + +#ifndef row0merge_h +#define row0merge_h + +#include "que0types.h" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "rem0rec.h" +#include "btr0types.h" +#include "row0mysql.h" +#include "lock0types.h" +#include "srv0srv.h" +#include "ut0stage.h" + +/* Reserve free space from every block for key_version */ +#define ROW_MERGE_RESERVE_SIZE 4 + +/* Cluster index read task is mandatory */ +#define COST_READ_CLUSTERED_INDEX 1.0 + +/* Basic fixed cost to build all type of index */ +#define COST_BUILD_INDEX_STATIC 0.5 +/* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */ +#define COST_BUILD_INDEX_DYNAMIC 0.5 + +/* Sum of below two must be 1.0 */ +#define PCT_COST_MERGESORT_INDEX 0.4 +#define PCT_COST_INSERT_INDEX 0.6 + +// Forward declaration +struct ib_sequence_t; + +/** @brief Block size for I/O operations in merge sort. + +The minimum is srv_page_size, or page_get_free_space_of_empty() +rounded to a power of 2. + +When not creating a PRIMARY KEY that contains column prefixes, this +can be set as small as srv_page_size / 2. */ +typedef byte row_merge_block_t; + +/** @brief Secondary buffer for I/O operations of merge records. + +This buffer is used for writing or reading a record that spans two +row_merge_block_t. Thus, it must be able to hold one merge record, +whose maximum size is the same as the minimum size of +row_merge_block_t. 
*/ +typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX]; + +/** @brief Merge record in row_merge_block_t. + +The format is the same as a record in ROW_FORMAT=COMPACT with the +exception that the REC_N_NEW_EXTRA_BYTES are omitted. */ +typedef byte mrec_t; + +/** Merge record in row_merge_buf_t */ +struct mtuple_t { + dfield_t* fields; /*!< data fields */ +}; + +/** Buffer for sorting in main memory. */ +struct row_merge_buf_t { + mem_heap_t* heap; /*!< memory heap where allocated */ + dict_index_t* index; /*!< the index the tuples belong to */ + ulint total_size; /*!< total amount of data bytes */ + ulint n_tuples; /*!< number of data tuples */ + ulint max_tuples; /*!< maximum number of data tuples */ + mtuple_t* tuples; /*!< array of data tuples */ + mtuple_t* tmp_tuples; /*!< temporary copy of tuples, + for sorting */ +}; + +/** Information about temporary files used in merge sort */ +struct merge_file_t { + pfs_os_file_t fd; /*!< file descriptor */ + ulint offset; /*!< file offset (end of file) */ + ib_uint64_t n_rec; /*!< number of records in the file */ +}; + +/** Index field definition */ +struct index_field_t { + ulint col_no; /*!< column offset */ + ulint prefix_len; /*!< column prefix length, or 0 + if indexing the whole column */ + bool is_v_col; /*!< whether this is a virtual column */ +}; + +/** Definition of an index being created */ +struct index_def_t { + const char* name; /*!< index name */ + bool rebuild; /*!< whether the table is rebuilt */ + ulint ind_type; /*!< 0, DICT_UNIQUE, + or DICT_CLUSTERED */ + ulint key_number; /*!< MySQL key number, + or ULINT_UNDEFINED if none */ + ulint n_fields; /*!< number of fields in index */ + index_field_t* fields; /*!< field definitions */ + st_mysql_ftparser* + parser; /*!< fulltext parser plugin */ +}; + +/** Structure for reporting duplicate records. 
*/ +struct row_merge_dup_t { + dict_index_t* index; /*!< index being sorted */ + struct TABLE* table; /*!< MySQL table object */ + const ulint* col_map;/*!< mapping of column numbers + in table to the rebuilt table + (index->table), or NULL if not + rebuilding table */ + ulint n_dup; /*!< number of duplicates */ +}; + +/*************************************************************//** +Report a duplicate key. */ +void +row_merge_dup_report( +/*=================*/ + row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ + const dfield_t* entry) /*!< in: duplicate index entry */ + MY_ATTRIBUTE((nonnull)); + +/*********************************************************************//** +Sets an exclusive lock on a table, for the duration of creating indexes. +@return error code or DB_SUCCESS */ +dberr_t +row_merge_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ + MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); + +/*********************************************************************//** +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +void +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ + MY_ATTRIBUTE((nonnull)); + +/** Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. 
+@param trx dictionary transaction +@param table table containing the indexes +@param locked True if table is locked, + false - may need to do lazy drop +@param alter_trx Alter table transaction */ +void +row_merge_drop_indexes( + trx_t* trx, + dict_table_t* table, + bool locked, + const trx_t* alter_trx=NULL); + +/*********************************************************************//** +Drop all partially created indexes during crash recovery. */ +void +row_merge_drop_temp_indexes(void); +/*=============================*/ + +/** Create temporary merge files in the given parameter path, and if +UNIV_PFS_IO is defined, register the file descriptor with Performance Schema. +@param[in] path location for creating temporary merge files, or NULL +@return File descriptor */ +pfs_os_file_t +row_merge_file_create_low( + const char* path) + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Destroy a merge file. And de-register the file from Performance Schema +if UNIV_PFS_IO is defined. */ +void +row_merge_file_destroy_low( +/*=======================*/ + const pfs_os_file_t& fd); /*!< in: merge file descriptor */ + +/*********************************************************************//** +Rename an index in the dictionary that was created. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/*********************************************************************//** +Rename an index in the dictionary that is to be dropped. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. 
+@return DB_SUCCESS if all OK */ +dberr_t +row_merge_rename_index_to_drop( +/*===========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/** Create the index and load in to the dictionary. +@param[in,out] table the index is on this table +@param[in] index_def the index definition +@param[in] add_v new virtual columns added along with add + index call +@return index, or NULL on error */ +dict_index_t* +row_merge_create_index( + dict_table_t* table, + const index_def_t* index_def, + const dict_add_v_col_t* add_v) + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Check if a transaction can use an index. +@return whether the index can be used by the transaction */ +bool +row_merge_is_index_usable( +/*======================*/ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to check */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Drop a table. The caller must have ensured that the background stats +thread is not processing the table. This can be done by calling +dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and +before calling this function. +@return DB_SUCCESS or error code */ +dberr_t +row_merge_drop_table( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table instance to drop */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Build indexes on a table by reading a clustered index, creating a temporary +file containing index entries, merge sorting these index entries and inserting +sorted index entries to indexes. 
+@param[in] trx transaction +@param[in] old_table table where rows are read from +@param[in] new_table table where indexes are created; identical to +old_table unless creating a PRIMARY KEY +@param[in] online true if creating indexes online +@param[in] indexes indexes to be created +@param[in] key_numbers MySQL key numbers +@param[in] n_indexes size of indexes[] +@param[in,out] table MySQL table, for reporting erroneous key value +if applicable +@param[in] defaults default values of added, changed columns, or NULL +@param[in] col_map mapping of old column numbers to new ones, or +NULL if old_table == new_table +@param[in] add_autoinc number of added AUTO_INCREMENT columns, or +ULINT_UNDEFINED if none is added +@param[in,out] sequence autoinc sequence +@param[in] skip_pk_sort whether the new PRIMARY KEY will follow +existing order +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of +this function and it will be passed to other functions for further accounting. +@param[in] add_v new virtual columns added along with indexes +@param[in] eval_table mysql table used to evaluate virtual column + value, see innobase_get_computed_value(). +@param[in] allow_non_null allow the conversion from null to not-null +@return DB_SUCCESS or error code */ +dberr_t +row_merge_build_indexes( + trx_t* trx, + dict_table_t* old_table, + dict_table_t* new_table, + bool online, + dict_index_t** indexes, + const ulint* key_numbers, + ulint n_indexes, + struct TABLE* table, + const dtuple_t* defaults, + const ulint* col_map, + ulint add_autoinc, + ib_sequence_t& sequence, + bool skip_pk_sort, + ut_stage_alter_t* stage, + const dict_add_v_col_t* add_v, + struct TABLE* eval_table, + bool allow_non_null) + MY_ATTRIBUTE((warn_unused_result)); + +/********************************************************************//** +Write a buffer to a block. 
 */ +void +row_merge_buf_write( +/*================*/ + const row_merge_buf_t* buf, /*!< in: sorted buffer */ + const merge_file_t* of, /*!< in: output file */ + row_merge_block_t* block) /*!< out: buffer for writing to file */ + MY_ATTRIBUTE((nonnull)); + +/********************************************************************//** +Sort a buffer. */ +void +row_merge_buf_sort( +/*===============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + MY_ATTRIBUTE((nonnull(1))); + +/********************************************************************//** +Write a merge block to the file system. +@return whether the request was completed successfully +@retval false on error +@retval true on success */ +UNIV_INTERN +bool +row_merge_write( +/*============*/ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to write, + in number of row_merge_block_t elements */ + const void* buf, /*!< in: data */ + void* crypt_buf, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ + MY_ATTRIBUTE((warn_unused_result)); + +/********************************************************************//** +Empty a sort buffer. +@return sort buffer */ +row_merge_buf_t* +row_merge_buf_empty( +/*================*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); + +/** Create a merge file in the given location. +@param[out] merge_file merge file structure +@param[in] path location for creating temporary file, or NULL +@return file descriptor, or -1 on failure */ +pfs_os_file_t +row_merge_file_create( + merge_file_t* merge_file, + const char* path) + MY_ATTRIBUTE((warn_unused_result, nonnull(1))); + +/** Merge disk files. 
+@param[in] trx transaction +@param[in] dup descriptor of index being created +@param[in,out] file file containing index entries +@param[in,out] block 3 buffers +@param[in,out] tmpfd temporary file handle +@param[in] update_progress true, if we should update progress status +@param[in] pct_progress total progress percent until now +@param[in] pct_cost current progress percent +@param[in] crypt_block crypt buf or NULL +@param[in] space space_id +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially +and then stage->inc() will be called for each record processed. +@return DB_SUCCESS or error code */ +dberr_t +row_merge_sort( +/*===========*/ + trx_t* trx, + const row_merge_dup_t* dup, + merge_file_t* file, + row_merge_block_t* block, + pfs_os_file_t* tmpfd, + const bool update_progress, + const double pct_progress, + const double pct_cost, + row_merge_block_t* crypt_block, + ulint space, + ut_stage_alter_t* stage = NULL) + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Allocate a sort buffer. +@return own: sort buffer */ +row_merge_buf_t* +row_merge_buf_create( +/*=================*/ + dict_index_t* index) /*!< in: secondary index */ + MY_ATTRIBUTE((warn_unused_result, nonnull, malloc)); + +/*********************************************************************//** +Deallocate a sort buffer. */ +void +row_merge_buf_free( +/*===============*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ + MY_ATTRIBUTE((nonnull)); + +/*********************************************************************//** +Destroy a merge file. */ +void +row_merge_file_destroy( +/*===================*/ + merge_file_t* merge_file) /*!< in/out: merge file structure */ + MY_ATTRIBUTE((nonnull)); + +/** Read a merge block from the file system. 
+@return whether the request was completed successfully */ +bool +row_merge_read( +/*===========*/ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to read + in number of row_merge_block_t + elements */ + row_merge_block_t* buf, /*!< out: data */ + row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ + MY_ATTRIBUTE((warn_unused_result)); + +/********************************************************************//** +Read a merge record. +@return pointer to next record, or NULL on I/O error or end of list */ +const byte* +row_merge_read_rec( +/*===============*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + const byte* b, /*!< in: pointer to record */ + const dict_index_t* index, /*!< in: index of the record */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t** mrec, /*!< out: pointer to merge record, + or NULL on end of list + (non-NULL on I/O error) */ + rec_offs* offsets,/*!< out: offsets of mrec */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ + MY_ATTRIBUTE((warn_unused_result)); +#endif /* row0merge.h */ diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h new file mode 100644 index 00000000..73e96930 --- /dev/null +++ b/storage/innobase/include/row0mysql.h @@ -0,0 +1,975 @@ +/***************************************************************************** + +Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0mysql.h +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. + +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#ifndef row0mysql_h +#define row0mysql_h + +#include "que0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "btr0types.h" +#include "lock0types.h" +#include "fil0fil.h" +#include "fts0fts.h" +#include "gis0type.h" + +#include "sql_list.h" +#include "sql_cmd.h" + +extern ibool row_rollback_on_timeout; + +struct row_prebuilt_t; +class ha_innobase; + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. 
+@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */ +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen);/*!< in: storage length of len: either 1 + or 2 bytes */ +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len); /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. 
+@return pointer to BLOB data */ +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len); /*!< in: BLOB reference length + (not BLOB length) */ +/*******************************************************************//** +Converts InnoDB geometry data format to MySQL data format. */ +void +row_mysql_store_geometry( +/*=====================*/ + byte* dest, /*!< in/out: where to store */ + ulint dest_len, /*!< in: dest buffer size: determines into + how many bytes the geometry length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const byte* src, /*!< in: geometry data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint src_len); /*!< in: geometry length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +/**************************************************************//** +Pad a column with spaces. */ +void +row_mysql_pad_col( +/*==============*/ + ulint mbminlen, /*!< in: minimum size of a character, + in bytes */ + byte* pad, /*!< out: padded buffer */ + ulint len); /*!< in: number of bytes to pad */ + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.cc. +@return up to which byte we used buf in the conversion */ +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! 
*/ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp); /*!< in: nonzero=compact format */ +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread */ +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + rollback, or the old error which was + during the function entry */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ + MY_ATTRIBUTE((nonnull(1,2))); +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. +@return own: a prebuilt struct */ +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len); /*!< in: length in bytes of a row in + the MySQL format */ +/********************************************************************//** +Free a prebuilt struct for a MySQL table handle. 
*/ +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */ + ibool dict_locked); /*!< in: TRUE=data dictionary locked */ +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx); /*!< in: transaction handle */ + +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Lock a table. +@param[in,out] prebuilt table handle +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table(row_prebuilt_t* prebuilt); + +/** System Versioning: row_insert_for_mysql() modes */ +enum ins_mode_t { + /* plain row (without versioning) */ + ROW_INS_NORMAL = 0, + /* row_start = TRX_ID, row_end = MAX */ + ROW_INS_VERSIONED, + /* row_end = TRX_ID */ + ROW_INS_HISTORICAL +}; + +/** Does an insert for MySQL. 
+@param[in] mysql_rec row in the MySQL format +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] ins_mode what row type we're inserting +@return error code or DB_SUCCESS*/ +dberr_t +row_insert_for_mysql( + const byte* mysql_rec, + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/** Does an update or delete of a row for MySQL. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@return error code or DB_SUCCESS */ +dberr_t +row_update_for_mysql( + row_prebuilt_t* prebuilt) + MY_ATTRIBUTE((warn_unused_result)); + +/** This can only be used when the current transaction is at +READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_for_mysql() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] has_latches_on_recs TRUE if called so that we have the + latches on the records under pcur + and clust_pcur, and we do not need + to reposition the cursors. 
 */ +void +row_unlock_for_mysql( + row_prebuilt_t* prebuilt, + ibool has_latches_on_recs); + +/*********************************************************************//** +Creates a query graph node of 'update' type to be used in the MySQL +interface. +@return own: update node */ +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap); /*!< in: mem heap from which allocated */ + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. +@return error code or DB_SUCCESS */ +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Locks the data dictionary exclusively for performing a table create or other +data dictionary modification operation. */ +void +row_mysql_lock_data_dictionary_func( +/*================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + unsigned line); /*!< in: line number */ +#define row_mysql_lock_data_dictionary(trx) \ + row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__) +/*********************************************************************//** +Unlocks the data dictionary exclusive lock. */ +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Locks the data dictionary in shared mode from modifications, for performing +foreign key check, rollback, or other operation invisible to MySQL. 
*/ +void +row_mysql_freeze_data_dictionary_func( +/*==================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + unsigned line); /*!< in: line number */ +#define row_mysql_freeze_data_dictionary(trx) \ + row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__) +/*********************************************************************//** +Unlocks the data dictionary shared lock. */ +void +row_mysql_unfreeze_data_dictionary( +/*===============================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a table for MySQL. On failure the transaction will be rolled back +and the 'table' object will be freed. +@return error code or DB_SUCCESS */ +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx, /*!< in/out: transaction */ + fil_encryption_t mode, /*!< in: encryption mode */ + uint32_t key_id) /*!< in: encryption key_id */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Create an index when creating a table. +On failure, the caller must drop the table! +@return error number or DB_SUCCESS */ +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths) /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. 
*/ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +The master thread in srv0srv.cc calls this regularly to drop tables which +we must drop in background after queries to them have ended. Such lazy +dropping of tables is needed in ALTER TABLE on Unix. +@return how many tables dropped + remaining tables in list */ +ulint +row_drop_tables_for_mysql_in_background(void); +/*=========================================*/ +/*********************************************************************//** +Get the background drop list length. NOTE: the caller must own the kernel +mutex! +@return how many tables in list */ +ulint +row_get_background_drop_list_len_low(void); +/*======================================*/ + +/** Drop garbage tables during recovery. */ +void +row_mysql_drop_garbage_tables(); + +/*********************************************************************//** +Sets an exclusive lock on a table. +@return error code or DB_SUCCESS */ +dberr_t +row_mysql_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */ + const char* op_info) /*!< in: string for trx->op_info */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Drop a table. +If the data dictionary was not already locked by the transaction, +the transaction will be committed. Otherwise, the data dictionary +will remain locked. +@param[in] name Table name +@param[in,out] trx Transaction handle +@param[in] sqlcom type of SQL operation +@param[in] create_failed true=create table failed + because e.g. foreign key column +@param[in] nonatomic Whether it is permitted to release + and reacquire dict_sys.latch +@return error code */ +dberr_t +row_drop_table_for_mysql( + const char* name, + trx_t* trx, + enum_sql_command sqlcom, + bool create_failed = false, + bool nonatomic = true); + +/** Drop a table after failed CREATE TABLE. 
 */ +dberr_t row_drop_table_after_create_fail(const char* name, trx_t* trx); + +/*********************************************************************//** +Discards the tablespace of a table which is stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the file_unreadable flag is set. +@return error code or DB_SUCCESS */ +dberr_t +row_discard_tablespace_for_mysql( +/*=============================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +dberr_t +row_import_tablespace_for_mysql( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Drop a database for MySQL. +@param[in] name database name which ends at '/' +@param[in] trx transaction handle +@param[out] found number of dropped tables/partitions +@return error code or DB_SUCCESS */ +dberr_t +row_drop_database_for_mysql( + const char* name, + trx_t* trx, + ulint* found); + +/*********************************************************************//** +Renames a table for MySQL. 
+@return error code or DB_SUCCESS */ +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool commit, /*!< in: whether to commit trx */ + bool use_fk) /*!< in: whether to parse and enforce + FOREIGN KEY constraints */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Scans an index for either COUNT(*) or CHECK TABLE. +If CHECK TABLE: checks that the index contains entries in an ascending order, +unique constraint is not broken, and calculates the number of index entries +in the read view of the current transaction. +@return DB_SUCCESS or other error */ +dberr_t +row_scan_index_for_mysql( +/*=====================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct + in MySQL handle */ + const dict_index_t* index, /*!< in: index */ + ulint* n_rows) /*!< out: number of entries + seen in the consistent read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Initialize this module */ +void +row_mysql_init(void); +/*================*/ + +/*********************************************************************//** +Close this module */ +void +row_mysql_close(void); +/*=================*/ + +/* A struct describing a place for an individual column in the MySQL +row format which is presented to the table handler in ha_innobase. +This template struct is used to speed up row transformations between +Innobase and MySQL. */ + +struct mysql_row_templ_t { + ulint col_no; /*!< column number of the column */ + ulint rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ibool rec_field_is_prefix; /* is this field in a prefix index? 
*/ + ulint rec_prefix_field_no; /* record field, even if just a + prefix; same as rec_field_no when not a + prefix, otherwise rec_field_no is + ULINT_UNDEFINED but this is the true + field number*/ + ulint clust_rec_field_no; /*!< field number of the column in an + Innobase record in the clustered index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint icp_rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined unless + index condition pushdown is used */ + ulint mysql_col_offset; /*!< offset of the column in the MySQL + row format */ + ulint mysql_col_len; /*!< length of the column in the MySQL + row format */ + ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a + MySQL record */ + ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit, + zero if column cannot be NULL */ + ulint type; /*!< column type in Innobase mtype + numbers DATA_CHAR... */ + ulint mysql_type; /*!< MySQL type code; this is always + < 256 */ + ulint mysql_length_bytes; /*!< if mysql_type + == DATA_MYSQL_TRUE_VARCHAR, this tells + whether we should use 1 or 2 bytes to + store the MySQL true VARCHAR data + length at the start of row in the MySQL + format (NOTE that the MySQL key value + format always uses 2 bytes for the data + len) */ + ulint charset; /*!< MySQL charset-collation code + of the column, or zero */ + ulint mbminlen; /*!< minimum length of a char, in bytes, + or zero if not a char type */ + ulint mbmaxlen; /*!< maximum length of a char, in bytes, + or zero if not a char type */ + ulint is_unsigned; /*!< if a column type is an integer + type and this field is != 0, then + it is an unsigned integer type */ + ulint is_virtual; /*!< if a column is a virtual column */ +}; + +#define MYSQL_FETCH_CACHE_SIZE 8 +/* After fetching this many rows, we start caching them in fetch_cache */ +#define MYSQL_FETCH_CACHE_THRESHOLD 4 + +#define ROW_PREBUILT_ALLOCATED 78540783 +#define ROW_PREBUILT_FREED 26423527 + 
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table +handle used within MySQL; these are used to save CPU time. */ + +struct row_prebuilt_t { + ulint magic_n; /*!< this magic number is set to + ROW_PREBUILT_ALLOCATED when created, + or ROW_PREBUILT_FREED when the + struct has been freed */ + dict_table_t* table; /*!< Innobase table handle */ + dict_index_t* index; /*!< current index for a search, if + any */ + trx_t* trx; /*!< current transaction handle */ + unsigned sql_stat_start:1;/*!< TRUE when we start processing + an SQL statement: we may have to set + an intention lock on the table, + create a consistent read view etc. */ + unsigned clust_index_was_generated:1; + /*!< if the user did not define a + primary key in MySQL, then Innobase + automatically generated a clustered + index where the ordering column is + the row id: in this case this flag + is set to TRUE */ + unsigned index_usable:1; /*!< caches the value of + row_merge_is_index_usable(trx,index) */ + unsigned read_just_key:1;/*!< set to 1 when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this + handle in a MySQL HANDLER low level + index cursor command: then we must + store the pcur position even in a + unique search from a clustered index, + because HANDLER allows NEXT and PREV + in such a situation */ + unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW, + ROW_MYSQL_REC_FIELDS, + ROW_MYSQL_DUMMY_TEMPLATE, or + ROW_MYSQL_NO_TEMPLATE */ + unsigned n_template:10; /*!< number of elements in the + template */ + unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL + bitmap at the start of a row in the + MySQL format */ + unsigned need_to_access_clustered:1; /*!< if we are fetching + columns through a secondary index + and at least one column is not in + the 
secondary index, then this is + set to TRUE; note that sometimes this + is set but we later optimize out the + clustered index lookup */ + unsigned templ_contains_blob:1;/*!< TRUE if the template contains + a column with DATA_LARGE_MTYPE( + get_innobase_type_from_mysql_type()) + is TRUE; + not to be confused with InnoDB + externally stored columns + (VARCHAR can be off-page too) */ + unsigned versioned_write:1;/*!< whether this is + a versioned write */ + mysql_row_templ_t* mysql_template;/*!< template used to transform + rows fast between MySQL and Innobase + formats; memory for this template + is not allocated from 'heap' */ + mem_heap_t* heap; /*!< memory heap from which + these auxiliary structures are + allocated when needed */ + ins_node_t* ins_node; /*!< Innobase SQL insert node + used to perform inserts + to the table */ + byte* ins_upd_rec_buff;/*!< buffer for storing data converted + to the Innobase format from the MySQL + format */ + const byte* default_rec; /*!< the default values of all columns + (a "default row") in MySQL format */ + ulint hint_need_to_fetch_extra_cols; + /*!< normally this is set to 0; if this + is set to ROW_RETRIEVE_PRIMARY_KEY, + then we should at least retrieve all + columns in the primary key; if this + is set to ROW_RETRIEVE_ALL_COLS, then + we must retrieve all columns in the + key (if read_just_key == 1), or all + columns in the table */ + upd_node_t* upd_node; /*!< Innobase SQL update node used + to perform updates and deletes */ + trx_id_t trx_id; /*!< The table->def_trx_id when + ins_graph was built */ + que_fork_t* ins_graph; /*!< Innobase SQL query graph used + in inserts. Will be rebuilt on + trx_id or n_indexes mismatch. 
*/ + que_fork_t* upd_graph; /*!< Innobase SQL query graph used + in updates or deletes */ + btr_pcur_t* pcur; /*!< persistent cursor used in selects + and updates */ + btr_pcur_t* clust_pcur; /*!< persistent cursor used in + some selects and updates */ + que_fork_t* sel_graph; /*!< dummy query graph used in + selects */ + dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */ + byte row_id[DATA_ROW_ID_LEN]; + /*!< if the clustered index was + generated, the row id of the + last row fetched is stored + here */ + doc_id_t fts_doc_id; /*!< if the table has an FTS index on + it then we fetch the doc_id. + FTS-FIXME: Currently we fetch it always + but in the future we must only fetch + it when FTS columns are being + updated */ + dtuple_t* clust_ref; /*!< prebuilt dtuple used in + sel/upd/del */ + lock_mode select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */ + lock_mode stored_select_lock_type;/*!< this field is used to + remember the original select_lock_type + that was decided in ha_innodb.cc, + ::store_lock(), ::external_lock(), + etc. */ + ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks + should be obtained for records + under an UPDATE or DELETE cursor. + At READ UNCOMMITTED or + READ COMMITTED isolation level, + this can be set to + ROW_READ_TRY_SEMI_CONSISTENT, so that + if the row under an UPDATE or DELETE + cursor was locked by another + transaction, InnoDB will resort + to reading the last committed value + ('semi-consistent read'). Then, + this field will be set to + ROW_READ_DID_SEMI_CONSISTENT to + indicate that. If the row does not + match the WHERE condition, MySQL will + invoke handler::unlock_row() to + clear the flag back to + ROW_READ_TRY_SEMI_CONSISTENT and + to simply skip the row. If + the row matches, the next call to + row_search_for_mysql() will lock + the row. + This eliminates lock waits in some + cases; note that this breaks + serializability. 
*/ + ulint new_rec_locks; /*!< normally 0; if + the session is using READ + COMMITTED or READ UNCOMMITTED + isolation level, set in + row_search_for_mysql() if we set a new + record lock on the secondary + or clustered index; this is + used in row_unlock_for_mysql() + when releasing the lock under + the cursor if we determine + after retrieving the row that + it does not need to be locked + ('mini-rollback') */ + ulint mysql_prefix_len;/*!< byte offset of the end of + the last requested column */ + ulint mysql_row_len; /*!< length in bytes of a row in the + MySQL format */ + ulint n_rows_fetched; /*!< number of rows fetched after + positioning the current cursor */ + ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */ + byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE]; + /*!< a cache for fetched rows if we + fetch many rows from the same cursor: + it saves CPU time to fetch them in a + batch; we reserve mysql_row_len + bytes for each such row; these + pointers point 4 bytes past the + allocated mem buf start, because + there is a 4 byte magic number at the + start and at the end */ + bool keep_other_fields_on_keyread; /*!< when using fetch + cache with HA_EXTRA_KEYREAD, don't + overwrite other fields in the MySQL + row buffer. */ + ulint fetch_cache_first;/*!< position of the first not yet + fetched row in fetch_cache */ + ulint n_fetch_cached; /*!< number of not yet fetched rows + in fetch_cache */ + mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied + to this heap */ + mem_heap_t* old_vers_heap; /*!< memory heap where a previous + version is built in consistent read */ + bool in_fts_query; /*!< Whether we are in a FTS query */ + bool fts_doc_id_in_read_set; /*!< true if table has externally + defined FTS_DOC_ID column. */ + /*----------------------*/ + ulonglong autoinc_last_value; + /*!< last value of AUTO-INC interval */ + ulonglong autoinc_increment;/*!< The increment step of the auto + increment column. Value must be + greater than or equal to 1. 
Required to + calculate the next value */ + ulonglong autoinc_offset; /*!< The offset passed to + get_auto_increment() by MySQL. Required + to calculate the next value */ + dberr_t autoinc_error; /*!< The actual error code encountered + while trying to init or read the + autoinc value from the table. We + store it here so that we can return + it to MySQL */ + /*----------------------*/ + + /** Argument of handler_rowid_filter_check(), + or NULL if no PRIMARY KEY filter is pushed */ + ha_innobase* pk_filter; + + /** Argument to handler_index_cond_check(), + or NULL if no index condition pushdown (ICP) is used. */ + ha_innobase* idx_cond; + ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols. + 0 if and only if idx_cond == NULL. */ + /*----------------------*/ + + /*----------------------*/ + rtr_info_t* rtr_info; /*!< R-tree Search Info */ + /*----------------------*/ + + ulint magic_n2; /*!< this should be the same as + magic_n */ + + byte* srch_key_val1; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + byte* srch_key_val2; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + uint srch_key_val_len; /*!< Size of search key */ + /** The MySQL table object */ + TABLE* m_mysql_table; + + /** Get template by dict_table_t::cols[] number. + The search starts at template position col and skips + templates that are marked is_virtual. + @param col dict_table_t::cols[] number; asserted with ut_ad() + to be less than n_template + @return the first template at position col or later whose + col_no equals col, or NULL if there is none */ + const mysql_row_templ_t* get_template_by_col(ulint col) const + { + ut_ad(col < n_template); + ut_ad(mysql_template); + for (ulint i = col; i < n_template; ++i) { + const mysql_row_templ_t* templ = &mysql_template[i]; + if (!templ->is_virtual && templ->col_no == col) { + return templ; + } + } + return NULL; + } +}; + +/** Callback for row_mysql_sys_index_iterate() */ +struct SysIndexCallback { + virtual ~SysIndexCallback() { } + + /** Callback method + @param mtr current mini transaction + @param pcur persistent cursor. 
*/ + virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0; +}; + + +/** Storage for calculating virtual columns */ + +class String; +struct VCOL_STORAGE +{ + TABLE *maria_table; + byte *innobase_record; + byte *maria_record; + String *blob_value_storage; + VCOL_STORAGE(): maria_table(NULL), innobase_record(NULL), + maria_record(NULL), blob_value_storage(NULL) {} +}; + +/** + Allocate a heap and record for calculating virtual fields + Used mainly for virtual fields in indexes + +@param[in] thd MariaDB THD +@param[in] index Index in use +@param[out] heap Heap that holds temporary row +@param[in,out] table MariaDB table +@param[out] storage Internal storage for the allocated records, + blobs etc + +@return true on success, false on failure +NOTE(review): ib_vcol_row::record() below treats a false return as +failure; confirm the exact failure causes against the definition. +*/ + +bool innobase_allocate_row_for_vcol( + THD * thd, + dict_index_t* index, + mem_heap_t** heap, + TABLE** table, + VCOL_STORAGE* storage); + +/** Free memory allocated by innobase_allocate_row_for_vcol() */ +void innobase_free_row_for_vcol(VCOL_STORAGE *storage); +/** Row storage for computing virtual columns; the storage is allocated + on the first call of record() and released by the destructor */ +class ib_vcol_row +{ + VCOL_STORAGE storage; +public: + mem_heap_t *heap; + + ib_vcol_row(mem_heap_t *heap) : heap(heap) {} + /** Allocate the row storage unless storage.innobase_record is + already set. 
+ @return storage.innobase_record, or NULL if + innobase_allocate_row_for_vcol() returned false */ + byte *record(THD *thd, dict_index_t *index, TABLE **table) + { + if (!storage.innobase_record) + { + bool ok = innobase_allocate_row_for_vcol(thd, index, &heap, table, + &storage); + if (!ok) + return NULL; + } + return storage.innobase_record; + }; + /** Free the row storage if it was allocated, and then the heap; + nothing is freed when heap is NULL */ + ~ib_vcol_row() + { + if (heap) + { + if (storage.innobase_record) + innobase_free_row_for_vcol(&storage); + mem_heap_free(heap); + } + } +}; + +/** Report virtual value computation failure in ib::error +@param[in] row the data row +*/ +ATTRIBUTE_COLD +void innobase_report_computed_value_failed(dtuple_t *row); + +/** Get the computed value by supplying the base column values. 
+@param[in,out] row the data row +@param[in] col virtual column +@param[in] index index on the virtual column +@param[in,out] local_heap heap memory for processing large data etc. +@param[in,out] heap memory heap that copies the actual index row +@param[in] ifield index field +@param[in] thd MySQL thread handle +@param[in,out] mysql_table mysql table object +@param mysql_rec MySQL record buffer; NOTE(review): direction + and exact use are not visible in this header, + confirm against the definition +@param[in] old_table during ALTER TABLE, this is the old table + or NULL. +@param[in] parent_update update vector for the parent row +@param[in] foreign foreign key information +@return the field filled with computed value */ +dfield_t* +innobase_get_computed_value( + dtuple_t* row, + const dict_v_col_t* col, + const dict_index_t* index, + mem_heap_t** local_heap, + mem_heap_t* heap, + const dict_field_t* ifield, + THD* thd, + TABLE* mysql_table, + byte* mysql_rec, + const dict_table_t* old_table, + upd_t* parent_update, + dict_foreign_t* foreign); + +/** Build the virtual column template of a table. +@param[in,out] table the table whose virtual column + template is to be built +@return NOTE(review): the meaning of the returned TABLE pointer is not +documented in this header; confirm against the definition */ +TABLE* innobase_init_vc_templ(dict_table_t* table); + +/** Change dbname and table name in table->vc_templ. 
+@param[in,out] table the table whose virtual column template +dbname and tbname to be renamed. */ +void +innobase_rename_vc_templ( + dict_table_t* table); + +#define ROW_PREBUILT_FETCH_MAGIC_N 465765687 +/* Values for row_prebuilt_t::template_type */ +#define ROW_MYSQL_WHOLE_ROW 0 +#define ROW_MYSQL_REC_FIELDS 1 +#define ROW_MYSQL_NO_TEMPLATE 2 +#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in + row_scan_and_check_index */ + +/* Values for hint_need_to_fetch_extra_cols */ +#define ROW_RETRIEVE_PRIMARY_KEY 1 +#define ROW_RETRIEVE_ALL_COLS 2 + +/* Values for row_read_type */ +#define ROW_READ_WITH_LOCKS 0 +#define ROW_READ_TRY_SEMI_CONSISTENT 1 +#define ROW_READ_DID_SEMI_CONSISTENT 2 + +#ifdef UNIV_DEBUG +/** Wait for the background drop list to become empty. 
*/ +void +row_wait_for_background_drop_list_empty(); +#endif /* UNIV_DEBUG */ + +#endif /* row0mysql.h */ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h new file mode 100644 index 00000000..091d80ad --- /dev/null +++ b/storage/innobase/include/row0purge.h @@ -0,0 +1,268 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0purge.h +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0purge_h +#define row0purge_h + +#include "que0types.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "trx0types.h" +#include "row0types.h" +#include "row0mysql.h" +#include "mysqld.h" +#include <queue> + +class MDL_ticket; +/** Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not-delete-marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. 
+ +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch. +@param[in,out] node row purge node +@param[in] index secondary index +@param[in] entry secondary index entry +@param[in,out] sec_pcur secondary index cursor, or NULL (the + default) if it is called for purge + buffering operation. +@param[in,out] sec_mtr mini-transaction which holds + secondary index entry, or NULL (the + default) if it is called for purge + buffering operation. +@param[in] is_tree true=pessimistic purge, + false=optimistic (leaf-page only); + false is the default +@return true if the secondary index record can be purged */ +bool +row_purge_poss_sec( + purge_node_t* node, + dict_index_t* index, + const dtuple_t* entry, + btr_pcur_t* sec_pcur=NULL, + mtr_t* sec_mtr=NULL, + bool is_tree=false); + +/*************************************************************** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. 
+@return query thread to run next or NULL */ +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Info required to purge a record */ +struct trx_purge_rec_t +{ + /** Record to purge */ + trx_undo_rec_t *undo_rec; + /** File pointer to undo record */ + roll_ptr_t roll_ptr; +}; + +/** Purge node structure: a query graph node of type QUE_NODE_PURGE */ + +struct purge_node_t{ + que_common_t common; /*!< node type: QUE_NODE_PURGE */ + /*----------------------*/ + /* Local storage for this graph node */ + roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */ + + undo_no_t undo_no;/*!< undo number of the record */ + + ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC, + ... */ +private: + /** latest unavailable table ID (do not bother looking up again) */ + table_id_t unavailable_table_id; + /** the latest modification of the table definition identified by + unavailable_table_id, or TRX_ID_MAX */ + trx_id_t def_trx_id; +public: + dict_table_t* table; /*!< table where purge is done */ + + ulint cmpl_info;/*!< compiler analysis info of an update */ + + upd_t* update; /*!< update vector for a clustered index + record */ + const dtuple_t* ref; /*!< NULL, or row reference to the next row to + handle */ + dtuple_t* row; /*!< NULL, or a copy (also fields copied to + heap) of the indexed fields of the row to + handle */ + dict_index_t* index; /*!< NULL, or the next index whose record should + be handled */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage for + row; this must be emptied after a successful + purge of a row */ + ibool found_clust;/*!< whether the clustered index record + determined by ref was found in the clustered + index, and we were able to position pcur on + it */ + btr_pcur_t pcur; /*!< persistent cursor used in searching the + clustered index record */ +#ifdef UNIV_DEBUG + /** whether the operation is in progress */ + bool in_progress; +#endif + trx_id_t trx_id; /*!< trx id for this 
purging record */ + + /** meta-data lock for the table name */ + MDL_ticket* mdl_ticket; + + /** table id of the previous undo log record */ + table_id_t last_table_id; + + /** purge thread */ + THD* purge_thd; + + /** number of undo log records for which the metadata lock has + been retained; see retain_mdl() */ + int mdl_hold_recs; + + /** Undo recs to purge */ + std::queue<trx_purge_rec_t> undo_recs; + + /** Constructor + @param[in] parent query graph parent, passed on to common. + The heap is created with mem_heap_create(256). Members that are + not named in the initializer list (for example def_trx_id and + trx_id) are not explicitly initialized here. */ + explicit purge_node_t(que_thr_t* parent) : + common(QUE_NODE_PURGE, parent), + unavailable_table_id(0), + table(NULL), + heap(mem_heap_create(256)), +#ifdef UNIV_DEBUG + in_progress(false), +#endif + mdl_ticket(NULL), + last_table_id(0), + purge_thd(NULL), + mdl_hold_recs(0) + { + } + +#ifdef UNIV_DEBUG + /***********************************************************//** + Validate the persistent cursor. The purge node has two references + to the clustered index record - one via the ref member, and the + other via the persistent cursor. These two references must match + each other if the found_clust flag is set. + @return true if the persistent cursor is consistent with + the ref member.*/ + bool validate_pcur(); +#endif + + /** Determine if a table should be skipped in purge. + @param[in] id table identifier + @return whether to skip the table lookup and processing */ + bool is_skipped(table_id_t id) const + { + return id == unavailable_table_id && trx_id <= def_trx_id; + } + + /** Remember that a table should be skipped in purge. + @param[in] id table identifier + @param[in] limit last transaction for which to skip */ + void skip(table_id_t id, trx_id_t limit) + { + DBUG_ASSERT(limit >= trx_id); + unavailable_table_id = id; + def_trx_id = limit; + } + + /** Start processing an undo log record. 
Resets row, ref, index, update, found_clust, rec_type and + cmpl_info, and sets purge_thd to current_thd if it is not yet + set. */ + void start() + { + ut_ad(in_progress); + DBUG_ASSERT(common.type == QUE_NODE_PURGE); + + row= nullptr; + ref= nullptr; + index= nullptr; + update= nullptr; + found_clust= FALSE; + rec_type= ULINT_UNDEFINED; + cmpl_info= ULINT_UNDEFINED; + if (!purge_thd) + purge_thd= current_thd; + } + + + /** Close the existing table and release the MDL for it. + Also resets last_table_id to 0. */ + void close_table() + { + last_table_id= 0; + if (!table) + { + ut_ad(!mdl_ticket); + return; + } + + innobase_reset_background_thd(purge_thd); + dict_table_close(table, false, false, purge_thd, mdl_ticket); + table= nullptr; + mdl_ticket= nullptr; + } + + + /** Retain the MDL for the table id. + @param[in] table_id table id to be processed + @return true if the MDL is retained, that is, table_id equals + last_table_id and mdl_hold_recs is below 100; otherwise the + counter is reset, the table is closed and false is returned */ + bool retain_mdl(table_id_t table_id) + { + ut_ad(table_id); + if (last_table_id == table_id && mdl_hold_recs < 100) + { + ut_ad(table); + mdl_hold_recs++; + return true; + } + + mdl_hold_recs= 0; + close_table(); + return false; + } + + + /** Reset the state at end + @return the query graph parent */ + que_node_t* end() + { + DBUG_ASSERT(common.type == QUE_NODE_PURGE); + close_table(); + ut_ad(undo_recs.empty()); + ut_d(in_progress= false); + purge_thd= nullptr; + mem_heap_empty(heap); + return common.parent; + } +}; + +#endif diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h new file mode 100644 index 00000000..b05b7666 --- /dev/null +++ b/storage/innobase/include/row0quiesce.h @@ -0,0 +1,67 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.h + +Header file for tablespace quiesce functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0quiesce_h +#define row0quiesce_h + +#include "dict0types.h" + +struct trx_t; + +/** The version number of the export meta-data text file. */ +#define IB_EXPORT_CFG_VERSION_V1 0x1UL + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + MY_ATTRIBUTE((nonnull)); + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. */ +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Cleanup after table quiesce. 
*/ +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: the table whose quiesce is being + cleaned up */ + trx_t* trx) /*!< in/out: transaction/session */ + MY_ATTRIBUTE((nonnull)); + +#endif /* row0quiesce_h */ diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h new file mode 100644 index 00000000..b4dab3c2 --- /dev/null +++ b/storage/innobase/include/row0row.h @@ -0,0 +1,432 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.h +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0row_h +#define row0row_h + +#include "que0types.h" +#include "ibuf0ibuf.h" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "row0types.h" +#include "btr0types.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. 
+@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: record offsets */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/* Flags for row build type; passed as the flag argument of +row_build_index_entry_low(). */ +#define ROW_BUILD_NORMAL 0 /*!< build index row */ +#define ROW_BUILD_FOR_PURGE 1 /*!< build row for purge. */ +#define ROW_BUILD_FOR_UNDO 2 /*!< build row for undo. */ +#define ROW_BUILD_FOR_INSERT 3 /*!< build row for insert. */ + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. 
*/ +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap, /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + ulint flag) /*!< in: row build type flag: + ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE + or ROW_BUILD_FOR_UNDO; + NOTE(review): ROW_BUILD_FOR_INSERT + is also defined as a row build + type flag, confirm whether it is + accepted here */ + MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4))); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4))); +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! 
*/ +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the former only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead; the user + columns in this table should be + the same columns as in index->table */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap); /*!< in: memory heap from which + the memory needed is allocated */ + +/** An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index, with possible indexing on ongoing +addition of new virtual columns. 
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA +@param[in] index clustered index +@param[in] rec record in the clustered index +@param[in] offsets rec_get_offsets(rec,index) or NULL +@param[in] col_table table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead +@param[in] defaults default values of added, changed columns, or NULL +@param[in] add_v new virtual columns added + along with new indexes +@param[in] col_map mapping of old column + numbers to new ones, or NULL +@param[out] ext cache of externally stored column + prefixes, or NULL +@param[in] heap memory heap from which + the memory needed is allocated +@return own: row built */ +dtuple_t* +row_build_w_add_vcol( + ulint type, + const dict_index_t* index, + const rec_t* rec, + const rec_offs* offsets, + const dict_table_t* col_table, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + row_ext_t** ext, + mem_heap_t* heap); + +/*******************************************************************//** +Converts an index record to a typed data tuple. +@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. 
+@return own: index entry built */ +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields +@return the data tuple built from rec (NOTE(review): presumably +allocated from heap; confirm against the definition) */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) + MY_ATTRIBUTE((nonnull,warn_unused_result)); + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! 
*/ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. The reference is built in the +caller-supplied tuple ref. */ +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! */ + const dict_index_t* index, /*!< in: secondary index */ + rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) + or NULL */ + MY_ATTRIBUTE((nonnull(1,2,3))); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. The reference is built in ref +according to the field number array map. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do + not copy field values to heap */ + const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +/***************************************************************//** +Searches the clustered index record for a row, if we have the row +reference. +@return TRUE if found */ +ibool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... 
*/ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +rec_t* +row_get_clust_rec( +/*==============*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param[in] data data to read +@param[in] len length of data +@param[in] mtype mtype of data +@param[in] unsigned_type if the data is unsigned +@return the integer value from the data */ +inline +ib_uint64_t +row_parse_int( + const byte* data, + ulint len, + ulint mtype, + bool unsigned_type); + +/** Result of row_search_index_entry */ +enum row_search_result { + ROW_FOUND = 0, /*!< the record was found */ + ROW_NOT_FOUND, /*!< record not found */ + ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or + BTR_DELETE_MARK was specified, the + secondary index leaf page was not in + the buffer pool, and the operation was + enqueued in the insert/delete buffer */ + ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and + row_purge_poss_sec() failed */ +}; + +/***************************************************************//** +Searches an index record. 
+@return whether the record was found or buffered */ +enum row_search_result +row_search_index_entry( +/*===================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry, /*!< in: index entry */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define ROW_COPY_DATA 1 +#define ROW_COPY_POINTERS 2 + +/* The allowed latching order of index records is the following: +(1) a secondary index record -> +(2) the clustered index record -> +(3) rollback segment data for the clustered index record. */ + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Prepare to start a mini-transaction to modify an index. 
+@param[in,out] mtr mini-transaction +@param[in,out] index possibly secondary index +@param[in] pessimistic whether this is a pessimistic operation */ +inline +void +row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic) +{ + mtr->start(); + + switch (index->table->space_id) { + case IBUF_SPACE_ID: + if (pessimistic + && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { + ibuf_free_excess_pages(); + } + break; + case SRV_TMP_SPACE_ID: + mtr->set_log_mode(MTR_LOG_NO_REDO); + break; + default: + index->set_modified(*mtr); + break; + } + + log_free_check(); +} + +#include "row0row.ic" + +#endif diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic new file mode 100644 index 00000000..e89adb58 --- /dev/null +++ b/storage/innobase/include/row0row.ic @@ -0,0 +1,221 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2018, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.ic +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0dict.h" +#include "rem0rec.h" +#include "trx0undo.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: record offsets */ +{ + ulint offset; + ulint len; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + + offset = rec_get_nth_field_offs(offsets, index->db_trx_id(), &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_trx_id(rec + offset)); +} + +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. 
+@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ +{ + dtuple_t* entry; + + ut_ad(dtuple_check_typed(row)); + entry = row_build_index_entry_low(row, ext, index, heap, + ROW_BUILD_NORMAL); + ut_ad(!entry || dtuple_check_typed(entry)); + return(entry); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. 
*/ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do + not copy field values to heap */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint field_no; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dtuple_get_n_fields(ref); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + field_no = *(map + i); + + if (field_no != ULINT_UNDEFINED) { + + field = rec_get_nth_field(rec, offsets, + field_no, &len); + dfield_set_data(dfield, field, len); + } + } +} + +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. 
 If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param[in] data data to read +@param[in] len length of data +@param[in] mtype mtype of data +@param[in] unsigned_type if the data is unsigned +@return the integer value from the data */ +ib_uint64_t +row_parse_int( + const byte* data, + ulint len, + ulint mtype, + bool unsigned_type) +{ + ib_uint64_t value = 0; + + switch (mtype) { + case DATA_INT: + + ut_a(len <= sizeof value); /* the stored integer must not be wider than value */ + value = mach_read_int_type(data, len, unsigned_type); + break; + + case DATA_FLOAT: + + ut_a(len == sizeof(float)); /* NOTE(review): the cast below drops the fraction; converting a negative or too large floating-point value to an unsigned integer type is undefined in C++, so the negative check at the end depends on what the platform produces - confirm */ + value = static_cast<ib_uint64_t>(mach_float_read(data)); + break; + + case DATA_DOUBLE: + + ut_a(len == sizeof(double)); /* NOTE(review): same conversion caveat as for DATA_FLOAT - confirm */ + value = static_cast<ib_uint64_t>(mach_double_read(data)); + break; + + default: + ut_error; + + } + + if (!unsigned_type && static_cast<int64_t>(value) < 0) { /* a signed value that reads as negative is reported as 0 */ + value = 0; + } + + return(value); +} + diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h new file mode 100644 index 00000000..60107712 --- /dev/null +++ b/storage/innobase/include/row0sel.h @@ -0,0 +1,482 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.h +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0sel_h +#define row0sel_h + +#include "data0data.h" +#include "que0types.h" +#include "trx0types.h" +#include "read0types.h" +#include "row0types.h" +#include "que0types.h" +#include "pars0sym.h" +#include "btr0pcur.h" +#include "row0mysql.h" + +/*********************************************************************//** +Creates a select node struct. +@return own: select node struct */ +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: memory heap where created */ +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +void +sel_node_free_private( +/*==================*/ + sel_node_t* node); /*!< in: select node struct */ +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */ +/*********************************************************************//** +Gets the plan node for the nth table in a join. 
+@return plan node */ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + sel_node_t* node, /*!< in: select node */ + ulint i); /*!< in: get ith plan node */ +/**********************************************************************//** +Performs a select step. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_sel_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an open or close cursor statement node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs a fetch for a cursor. +@return query thread to run next or NULL */ +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************//** +Prints a row in a select result. +@return query thread to run next or NULL */ +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. */ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt); + +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. 
 But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. */ +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len); /*!< in: MySQL key value length */ + + +/** Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be attempted on the cursor! + +@param[out] buf buffer for the fetched row in MySQL format +@param[in] mode search mode, e.g. PAGE_CUR_L +@param[in,out] prebuilt prebuilt struct for the table handler; + this contains the info of search_tuple, + index; if search tuple contains 0 fields then + we position the cursor at the start or the end of + the index, depending on 'mode' +@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX +@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV; + Note: if this is != 0, then prebuilt must have a + pcur with a stored position! When opening a + cursor, 'direction' should be 0. 
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ +UNIV_INLINE +dberr_t +row_search_for_mysql( + byte* buf, + page_cur_mode_t mode, + row_prebuilt_t* prebuilt, + ulint match_mode, + ulint direction) + MY_ATTRIBUTE((warn_unused_result)); + +/** Searches for rows in the database using a cursor. +Function is mainly used for tables that are shared across connections and +so it employs techniques that can help reconstruct the rows that the +transaction is supposed to see. +It also has optimizations such as pre-caching the rows, using AHI, etc. + +@param[out] buf buffer for the fetched row in MySQL format +@param[in] mode search mode, e.g. PAGE_CUR_L +@param[in,out] prebuilt prebuilt struct for the table handler; + this contains the info of search_tuple, + index; if search tuple contains 0 fields then + we position the cursor at the start or the end of + the index, depending on 'mode' +@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX +@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV; + Note: if this is != 0, then prebuilt must have a + pcur with a stored position! When opening a + cursor, 'direction' should be 0. +@return DB_SUCCESS or error code */ +dberr_t +row_search_mvcc( + byte* buf, + page_cur_mode_t mode, + row_prebuilt_t* prebuilt, + ulint match_mode, + ulint direction) + MY_ATTRIBUTE((warn_unused_result)); + +/********************************************************************//** +Count rows in an R-Tree leaf level. +@return DB_SUCCESS if successful */ +dberr_t +row_count_rtree_recs( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint* n_rows); /*!< out: number of entries + seen in the consistent read */ + +/** Read the max AUTOINC value from an index. 
+@param[in] index index starting with an AUTO_INCREMENT column +@return the largest AUTO_INCREMENT value +@retval 0 if no records were found */ +ib_uint64_t +row_search_max_autoinc(dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** A structure for caching column values for prefetched rows */ +struct sel_buf_t{ + byte* data; /*!< data, or NULL; if not NULL, this field + has allocated memory which must be explicitly + freed; can be != NULL even when len is + UNIV_SQL_NULL */ + ulint len; /*!< data length or UNIV_SQL_NULL */ + ulint val_buf_size; + /*!< size of memory buffer allocated for data: + this can be more than len; this is defined + when data != NULL */ +}; + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. */ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt); + +/** Query plan */ +struct plan_t{ + dict_table_t* table; /*!< table struct in the dictionary + cache */ + dict_index_t* index; /*!< table index used in the search */ + btr_pcur_t pcur; /*!< persistent cursor used to search + the index */ + ibool asc; /*!< TRUE if cursor traveling upwards */ + ibool pcur_is_open; /*!< TRUE if pcur has been positioned + and we can try to fetch new rows */ + ibool cursor_at_end; /*!< TRUE if the cursor is open but + we know that there are no more + qualifying rows left to retrieve from + the index tree; NOTE though, that + there may still be unprocessed rows in + the prefetch stack; always FALSE when + pcur_is_open is FALSE */ + ibool stored_cursor_rec_processed; + /*!< TRUE if the pcur position has been + stored and the record it is positioned + on has already been processed */ + que_node_t** tuple_exps; /*!< array of expressions + which are used to calculate + the 
field values in the search + tuple: there is one expression + for each field in the search + tuple */ + dtuple_t* tuple; /*!< search tuple */ + page_cur_mode_t mode; /*!< search mode: PAGE_CUR_G, ... */ + ulint n_exact_match; /*!< number of first fields in + the search tuple which must be + exactly matched */ + ibool unique_search; /*!< TRUE if we are searching an + index record with a unique key */ + ulint n_rows_fetched; /*!< number of rows fetched using pcur + after it was opened */ + ulint n_rows_prefetched;/*!< number of prefetched rows cached + for fetch: fetching several rows in + the same mtr saves CPU time */ + ulint first_prefetched;/*!< index of the first cached row in + select buffer arrays for each column */ + ibool no_prefetch; /*!< no prefetch for this table */ + sym_node_list_t columns; /*!< symbol table nodes for the columns + to retrieve from the table */ + UT_LIST_BASE_NODE_T(func_node_t) + end_conds; /*!< conditions which determine the + fetch limit of the index segment we + have to look at: when one of these + fails, the result set has been + exhausted for the cursor in this + index; these conditions are normalized + so that in a comparison the column + for this table is the first argument */ + UT_LIST_BASE_NODE_T(func_node_t) + other_conds; /*!< the rest of search conditions we can + test at this table in a join */ + ibool must_get_clust; /*!< TRUE if index is a non-clustered + index and we must also fetch the + clustered index record; this is the + case if the non-clustered record does + not contain all the needed columns, or + if this is a single-table explicit + cursor, or a searched update or + delete */ + ulint* clust_map; /*!< map telling how clust_ref is built + from the fields of a non-clustered + record */ + dtuple_t* clust_ref; /*!< the reference to the clustered + index entry is built here if index is + a non-clustered index */ + btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use + this pcur to search the clustered + 
index */ + mem_heap_t* old_vers_heap; /*!< memory heap used in building an old + version of a row, or NULL */ +}; + +/** Select node states */ +enum sel_node_state { + SEL_NODE_CLOSED, /*!< it is a declared cursor which is not + currently open */ + SEL_NODE_OPEN, /*!< intention locks not yet set on tables */ + SEL_NODE_FETCH, /*!< intention locks have been set */ + SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */ +}; + +/** Select statement node */ +struct sel_node_t{ + que_common_t common; /*!< node type: QUE_NODE_SELECT */ + enum sel_node_state + state; /*!< node state */ + que_node_t* select_list; /*!< select list */ + sym_node_t* into_list; /*!< variables list or NULL */ + sym_node_t* table_list; /*!< table list */ + ibool asc; /*!< TRUE if the rows should be fetched + in an ascending order */ + ibool set_x_locks; /*!< TRUE if the cursor is for update or + delete, which means that a row x-lock + should be placed on the cursor row */ + lock_mode row_lock_mode; /*!< LOCK_X or LOCK_S */ + ulint n_tables; /*!< number of tables */ + ulint fetch_table; /*!< number of the next table to access + in the join */ + plan_t* plans; /*!< array of n_tables many plan nodes + containing the search plan and the + search data structures */ + que_node_t* search_cond; /*!< search condition */ + ReadView* read_view; /*!< if the query is a non-locking + consistent read, its read view is + placed here, otherwise NULL */ + ibool consistent_read;/*!< TRUE if the select is a consistent, + non-locking read */ + order_node_t* order_by; /*!< order by column definition, or + NULL */ + ibool is_aggregate; /*!< TRUE if the select list consists of + aggregate functions */ + ibool aggregate_already_fetched; + /*!< TRUE if the aggregate row has + already been fetched for the current + cursor */ + ibool can_get_updated;/*!< this is TRUE if the select + is in a single-table explicit + cursor which can get updated + within the stored procedure, + or in a searched update or + 
delete; NOTE that to determine + of an explicit cursor if it + can get updated, the parser + checks from a stored procedure + if it contains positioned + update or delete statements */ + sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */ + UT_LIST_BASE_NODE_T(sym_node_t) + copy_variables; /*!< variables whose values we have to + copy when an explicit cursor is opened, + so that they do not change between + fetches */ +}; + +/** Fetch statement node */ +struct fetch_node_t{ + que_common_t common; /*!< type: QUE_NODE_FETCH */ + sel_node_t* cursor_def; /*!< cursor definition */ + sym_node_t* into_list; /*!< variables to set */ + + pars_user_func_t* + func; /*!< User callback function or NULL. + The first argument to the function + is a sel_node_t*, containing the + results of the SELECT operation for + one row. If the function returns + NULL, it is not interested in + further rows and the cursor is + modified so (cursor % NOTFOUND) is + true. If it returns not-NULL, + continue normally. 
*/ +}; + +/** Open or close cursor operation type */ +enum open_node_op { + ROW_SEL_OPEN_CURSOR, /*!< open cursor */ + ROW_SEL_CLOSE_CURSOR /*!< close cursor */ +}; + +/** Open or close cursor statement node */ +struct open_node_t{ + que_common_t common; /*!< type: QUE_NODE_OPEN */ + enum open_node_op + op_type; /*!< operation type: open or + close cursor */ + sel_node_t* cursor_def; /*!< cursor definition */ +}; + +/** Row printf statement node */ +struct row_printf_node_t{ + que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */ + sel_node_t* sel_node; /*!< select */ +}; + +/** Search direction for the MySQL interface */ +enum row_sel_direction { + ROW_SEL_NEXT = 1, /*!< ascending direction */ + ROW_SEL_PREV = 2 /*!< descending direction */ +}; + +/** Match mode for the MySQL interface */ +enum row_sel_match_mode { + ROW_SEL_EXACT = 1, /*!< search using a complete key value */ + ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which + must match rows: the prefix may + contain an incomplete field (the last + field in prefix may be just a prefix + of a fixed length column) */ +}; + +#ifdef UNIV_DEBUG +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len) +#else /* UNIV_DEBUG */ +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,src,len) +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. 
*/ + +void +row_sel_field_store_in_mysql_format_func( +/*=====================================*/ + byte* dest, /*!< in/out: buffer where to store; NOTE + that BLOBs are not in themselves + stored here: the caller must allocate + and copy the BLOB into buffer before, + and pass the pointer to the BLOB in + 'data' */ + const mysql_row_templ_t* templ, + /*!< in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, + mbminlen, mbmaxlen */ +#ifdef UNIV_DEBUG + const dict_index_t* index, + /*!< in: InnoDB index */ + ulint field_no, + /*!< in: templ->rec_field_no or + templ->clust_rec_field_no or + templ->icp_rec_field_no */ +#endif /* UNIV_DEBUG */ + const byte* data, /*!< in: data to store */ + ulint len); /*!< in: length of the data */ + +#include "row0sel.ic" + +#endif diff --git a/storage/innobase/include/row0sel.ic b/storage/innobase/include/row0sel.ic new file mode 100644 index 00000000..7880605c --- /dev/null +++ b/storage/innobase/include/row0sel.ic @@ -0,0 +1,138 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.ic +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" + +/*********************************************************************//** +Gets the plan node for the nth table in a join. +@return plan node */ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + sel_node_t* node, /*!< in: select node */ + ulint i) /*!< in: get ith plan node */ +{ + ut_ad(i < node->n_tables); + + return(node->plans + i); +} + +/*********************************************************************//** +Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means +that it will start fetching from the start of the result set again, regardless +of where it was before, and it will set intention locks on the tables. */ +UNIV_INLINE +void +sel_node_reset_cursor( +/*==================*/ + sel_node_t* node) /*!< in: select node */ +{ + node->state = SEL_NODE_OPEN; +} + +/**********************************************************************//** +Performs an execution step of an open or close cursor statement node. 
+@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* sel_node; + open_node_t* node; + ulint err; /* holds DB_SUCCESS or DB_ERROR */ + + ut_ad(thr); + + node = (open_node_t*) thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_OPEN); + + sel_node = node->cursor_def; + + err = DB_SUCCESS; + + if (node->op_type == ROW_SEL_OPEN_CURSOR) { + + /* if (sel_node->state == SEL_NODE_CLOSED) { */ + + sel_node_reset_cursor(sel_node); /* opening always resets the cursor: the check that it was closed is commented out */ + /* } else { + err = DB_ERROR; + } */ + } else { + if (sel_node->state != SEL_NODE_CLOSED) { + + sel_node->state = SEL_NODE_CLOSED; + } else { + err = DB_ERROR; + } + } + + if (err != DB_SUCCESS) { + /* SQL error detected: the only failure set above is a non-open (close) request on a cursor that is already closed */ + fprintf(stderr, "SQL error %lu\n", (ulong) err); + + ut_error; + } + + thr->run_node = que_node_get_parent(node); /* continue at the parent of the open/close node */ + + return(thr); +} + + +/** Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be attempted on the cursor! + +@param[out] buf buffer for the fetched row in MySQL format +@param[in] mode search mode, e.g. PAGE_CUR_L +@param[in,out] prebuilt prebuilt struct for the table handler; + this contains the info of search_tuple, + index; if search tuple contains 0 fields then + we position the cursor at the start or the end of + the index, depending on 'mode' +@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX +@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV; + Note: if this is != 0, then prebuilt must have a + pcur with a stored position! When opening a + cursor, 'direction' should be 0. 
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+UNIV_INLINE
+dberr_t
+row_search_for_mysql(
+	byte*			buf,
+	page_cur_mode_t		mode,
+	row_prebuilt_t*		prebuilt,
+	ulint			match_mode,
+	ulint			direction)
+{
+	/* Pure forwarding wrapper: all five arguments are passed on
+	unchanged and the result of row_search_mvcc() is returned as is. */
+	return(row_search_mvcc(buf, mode, prebuilt, match_mode, direction));
+}
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
new file mode 100644
index 00000000..5e737c1c
--- /dev/null
+++ b/storage/innobase/include/row0types.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0types.h +Row operation global types + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#pragma once +#include "buf0types.h" + +struct plan_t; + +struct upd_t; +struct upd_field_t; +struct upd_node_t; +struct del_node_t; +struct ins_node_t; +struct sel_node_t; +struct open_node_t; +struct fetch_node_t; + +struct row_printf_node_t; +struct sel_buf_t; + +struct undo_node_t; + +struct purge_node_t; + +struct row_ext_t; + +/** Buffer for logging modifications during online index creation */ +struct row_log_t; + +/* MySQL data types */ +struct TABLE; diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h new file mode 100644 index 00000000..a9877969 --- /dev/null +++ b/storage/innobase/include/row0uins.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0uins.h +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0uins_h +#define row0uins_h + +#include "data0data.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. +@return DB_SUCCESS */ +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#endif diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h new file mode 100644 index 00000000..5032e103 --- /dev/null +++ b/storage/innobase/include/row0umod.h @@ -0,0 +1,46 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0umod.h +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0umod_h +#define row0umod_h + +#include "data0data.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a modify operation on a row of a table. +@return DB_SUCCESS or error code */ +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +#endif diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h new file mode 100644 index 00000000..4357a908 --- /dev/null +++ b/storage/innobase/include/row0undo.h @@ -0,0 +1,128 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0undo.h +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0undo_h +#define row0undo_h + +#include "trx0sys.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "que0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a row undo node to a query graph. +@return own: undo node */ +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. +@return true if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +bool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node) /*!< in/out: row undo node */ + MY_ATTRIBUTE((warn_unused_result)); +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. 
+@return query thread to run next or NULL */ +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* A single query thread will try to perform the undo for all successive +versions of a clustered index record, if the transaction has modified it +several times during the execution which is rolled back. It may happen +that the task is transferred to another query thread, if the other thread +is assigned to handle an undo log record in the chain of different versions +of the record, and the other thread happens to get the x-latch to the +clustered index record at the right time. + If a query thread notices that the clustered index record it is looking +for is missing, or the roll ptr field in the record does not point to the +undo log record the thread was assigned to handle, then it gives up the undo +task for that undo log record, and fetches the next. This situation can occur +just in the case where the transaction modified the same record several times +and another thread is currently doing the undo for successive versions of +that index record. 
*/
+
+/** Execution state of an undo node.
+The first enumerator is explicitly 1 and the rest follow consecutively,
+so the value 0 does not name any state. */
+enum undo_exec {
+	UNDO_NODE_FETCH_NEXT = 1,	/*!< we should fetch the next
+					undo log record */
+	/** rollback an insert into persistent table */
+	UNDO_INSERT_PERSISTENT,
+	/** rollback an update (or delete) in a persistent table */
+	UNDO_UPDATE_PERSISTENT,
+	/** rollback an insert into temporary table */
+	UNDO_INSERT_TEMPORARY,
+	/** rollback an update (or delete) in a temporary table */
+	UNDO_UPDATE_TEMPORARY,
+};
+
+/** Undo node structure: the per-node working state used while undoing
+the undo log records of one transaction (see the member comments). */
+struct undo_node_t{
+	que_common_t	common;	/*!< node type: QUE_NODE_UNDO */
+	undo_exec	state;	/*!< rollback execution state */
+	trx_t*		trx;	/*!< trx for which undo is done */
+	roll_ptr_t	roll_ptr;/*!< roll pointer to undo log record */
+	trx_undo_rec_t*	undo_rec;/*!< undo log record */
+	undo_no_t	undo_no;/*!< undo number of the record */
+	ulint		rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+				... */
+	trx_id_t	new_trx_id; /*!< trx id to restore to clustered index
+				record */
+	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
+				clustered index record */
+	dict_table_t*	table;	/*!< table where undo is done */
+	ulint		cmpl_info;/*!< compiler analysis of an update */
+	upd_t*		update;	/*!< update vector for a clustered index
+				record */
+	const dtuple_t*	ref;	/*!< row reference to the next row to handle */
+	dtuple_t*	row;	/*!< a copy (also fields copied to heap) of the
+				row to handle */
+	row_ext_t*	ext;	/*!< NULL, or prefixes of the externally
+				stored columns of the row */
+	dtuple_t*	undo_row;/*!< NULL, or the row after undo */
+	row_ext_t*	undo_ext;/*!< NULL, or prefixes of the externally
+				stored columns of undo_row */
+	dict_index_t*	index;	/*!< the next index whose record should be
+				handled */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage for
+				row; this must be emptied after undo is tried
+				on a row */
+};
+
+#endif
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
new file mode 100644
index 00000000..58c60a0a --- /dev/null +++ b/storage/innobase/include/row0upd.h @@ -0,0 +1,568 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.h +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0upd_h +#define row0upd_h + +#include "data0data.h" +#include "rem0types.h" +#include "row0types.h" +#include "btr0types.h" +#include "trx0types.h" +#include "btr0pcur.h" +#include "que0types.h" +#include "pars0types.h" + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap); /*!< in: heap from which memory allocated */ +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. 
+@return number of fields */ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + const upd_t* update); /*!< in: update vector */ +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the nth field of an update vector. +@return update vector field */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + const upd_t* update, /*!< in: update vector */ + ulint n); /*!< in: field position in update vector */ +#else +# define upd_get_nth_field(update, n) ((update)->fields + (n)) +#endif + +/*********************************************************************//** +Sets an index field number to be updated by an update vector field. */ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /*!< in: update vector field */ + uint16_t field_no, /*!< in: field number in a clustered + index */ + dict_index_t* index); + +/** set field number to a update vector field, marks this field is updated +@param[in,out] upd_field update vector field +@param[in] field_no virtual column sequence num +@param[in] index index */ +UNIV_INLINE +void +upd_field_set_v_field_no( + upd_field_t* upd_field, + uint16_t field_no, + dict_index_t* index); +/*********************************************************************//** +Returns a field of an update vector by field_no. +@return update vector field, or NULL */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + const upd_t* update, /*!< in: update vector */ + uint16_t no, /*!< in: field_no */ + bool is_virtual) /*!< in: if it is a virtual column */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Creates an update node for a query graph. 
+@return own: update node */ +upd_node_t* +upd_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. +@return TRUE if the update changes the size of some field in index or +the field is external in rec or update */ +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update);/*!< in: update vector */ +/***********************************************************//** +Returns true if row update contains disowned external fields. +@return true if the update contains disowned external fields. */ +bool +row_upd_changes_disowned_external( +/*==============================*/ + const upd_t* update) /*!< in: update vector */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/***************************************************************//** +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! +@return own: update vector of differing fields */ +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const dtuple_t* entry, /*!< in: entry to insert */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/** Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. 
NOTE: we compare the fields as binary strings! +@param[in] index clustered index +@param[in] entry clustered index entry to insert +@param[in] rec clustered index record +@param[in] offsets rec_get_offsets(rec,index), or NULL +@param[in] no_sys skip the system columns + DB_TRX_ID and DB_ROLL_PTR +@param[in] trx transaction (for diagnostics), + or NULL +@param[in] heap memory heap from which allocated +@param[in,out] mysql_table NULL, or mysql table object when + user thread invokes dml +@param[out] error error number in case of failure +@return own: update vector of differing fields, excluding roll ptr and +trx id */ +upd_t* +row_upd_build_difference_binary( + dict_index_t* index, + const dtuple_t* entry, + const rec_t* rec, + const rec_offs* offsets, + bool no_sys, + trx_t* trx, + mem_heap_t* heap, + TABLE* mysql_table, + dberr_t* error) + MY_ATTRIBUTE((nonnull(1,2,3,7,9), warn_unused_result)); +/** Apply an update vector to an index entry. +@param[in,out] entry index entry to be updated; the clustered index record + must be covered by a lock or a page latch to prevent + deletion (rollback or purge) +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ +void +row_upd_index_replace_new_col_vals_index_pos( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) + MY_ATTRIBUTE((nonnull)); +/** Replace the new column values stored in the update vector, +during trx_undo_prev_version_build(). 
+@param entry clustered index tuple where the values are replaced + (the clustered index leaf page latch must be held) +@param index clustered index +@param update update vector for the clustered index +@param heap memory heap for allocating and copying values +@return whether the previous version was built successfully */ +bool +row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index, + const upd_t *update, mem_heap_t *heap) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************//** +Replaces the new column values stored in the update vector. */ +void +row_upd_replace( +/*============*/ + dtuple_t* row, /*!< in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /*!< out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: an update vector built for the + clustered index */ + mem_heap_t* heap); /*!< in: memory heap */ +/** Replaces the virtual column values stored in a dtuple with that of +a update vector. +@param[in,out] row dtuple whose column to be updated +@param[in] table table +@param[in] update an update vector built for the clustered index +@param[in] upd_new update to new or old value +@param[in,out] undo_row undo row (if needs to be updated) +@param[in] ptr remaining part in update undo log */ +void +row_upd_replace_vcol( + dtuple_t* row, + const dict_table_t* table, + const upd_t* update, + bool upd_new, + dtuple_t* undo_row, + const byte* ptr); + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. + +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. 
+NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext, /*!< NULL, or prefixes of the externally + stored columns in the old row */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */ + MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); +#ifdef UNIV_DEBUG +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,thr,row,ext,0) +#else /* UNIV_DEBUG */ +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,row,ext,0) +#endif /* UNIV_DEBUG */ +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field); /*!< in: field to check */ +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. 
+@return whether Doc ID column is affected */ +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector may change an ordering field in an index +record */ +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update);/*!< in: update vector for the row */ +/***********************************************************//** +Updates a row in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* Update vector field */ +struct upd_field_t{ + uint16_t field_no; /*!< field number in an index, usually + the clustered index, but in updating + a secondary index record in btr0cur.cc + this is the position in the secondary + index. 
If this field is a virtual + column, then field_no represents + the nth virtual column in the table */ + uint16_t orig_len; /*!< original length of the locally + stored part of an externally stored + column, or 0 */ + que_node_t* exp; /*!< expression for calculating a new + value: it refers to column values and + constants in the symbol table of the + query graph */ + dfield_t new_val; /*!< new value for the column */ + dfield_t* old_v_val; /*!< old value for the virtual column */ +}; + + +/* check whether an update field is on virtual column */ +#define upd_fld_is_virtual_col(upd_fld) \ + (((upd_fld)->new_val.type.prtype & DATA_VIRTUAL) == DATA_VIRTUAL) + +/* set DATA_VIRTUAL bit on update field to show it is a virtual column */ +#define upd_fld_set_virtual_col(upd_fld) \ + ((upd_fld)->new_val.type.prtype |= DATA_VIRTUAL) + +/* Update vector structure */ +struct upd_t{ + mem_heap_t* heap; /*!< heap from which memory allocated */ + byte info_bits; /*!< new value of info bits to record; + default is 0 */ + dtuple_t* old_vrow; /*!< pointer to old row, used for + virtual column update now */ + ulint n_fields; /*!< number of update fields */ + upd_field_t* fields; /*!< array of update fields */ + byte vers_sys_value[8]; /*!< buffer for updating system fields */ + + /** Append an update field to the end of array + @param[in] field an update field */ + void append(const upd_field_t& field) + { + fields[n_fields++] = field; + } + + void remove_element(ulint i) + { + ut_ad(n_fields > 0); + ut_ad(i < n_fields); + while (i < n_fields - 1) + { + fields[i]= fields[i + 1]; + i++; + } + n_fields--; + } + + bool remove(const ulint field_no) + { + for (ulint i= 0; i < n_fields; ++i) + { + if (field_no == fields[i].field_no) + { + remove_element(i); + return true; + } + } + return false; + } + + /** Determine if the given field_no is modified. + @return true if modified, false otherwise. 
*/ + bool is_modified(uint16_t field_no) const + { + for (ulint i = 0; i < n_fields; ++i) { + if (field_no == fields[i].field_no) { + return(true); + } + } + return(false); + } + + /** Determine if the update affects a system versioned column or row_end. */ + bool affects_versioned() const + { + for (ulint i = 0; i < n_fields; i++) { + dtype_t type = fields[i].new_val.type; + if (type.is_versioned()) { + return true; + } + // versioned DELETE is UPDATE SET row_end=NOW + if (type.vers_sys_end()) { + return true; + } + } + return false; + } + + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE */ + bool is_metadata() const { return dtuple_t::is_metadata(info_bits); } + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const + { return dtuple_t::is_alter_metadata(info_bits); } + +#ifdef UNIV_DEBUG + bool validate() const + { + for (ulint i = 0; i < n_fields; ++i) { + dfield_t* field = &fields[i].new_val; + if (dfield_is_ext(field)) { + ut_ad(dfield_get_len(field) + >= BTR_EXTERN_FIELD_REF_SIZE); + } + } + return(true); + } +#endif // UNIV_DEBUG +}; + +/** Kinds of update operation */ +enum delete_mode_t { + NO_DELETE = 0, /*!< this operation does not delete */ + PLAIN_DELETE, /*!< ordinary delete */ + VERSIONED_DELETE /*!< update old and insert a new row */ +}; + +/* Update node structure which also implements the delete operation +of a row */ + +struct upd_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UPDATE */ + delete_mode_t is_delete; /*!< kind of DELETE */ + ibool searched_update; + /* TRUE if searched update, FALSE if + positioned */ + bool in_mysql_interface; + /* whether the update node was created + for the MySQL interface */ + dict_foreign_t* foreign;/* NULL or pointer to a foreign key + constraint if this update node is used in + doing an ON DELETE or ON UPDATE operation */ + upd_node_t* cascade_node;/* NULL or an update node 
template which + is used to implement ON DELETE/UPDATE CASCADE + or ... SET NULL for foreign keys */ + mem_heap_t* cascade_heap; + /*!< NULL or a mem heap where cascade + node is created.*/ + sel_node_t* select; /*!< query graph subtree implementing a base + table cursor: the rows returned will be + updated */ + btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered + index record which should be updated or + deleted; the cursor is stored in the graph + of 'select' field above, except in the case + of the MySQL interface */ + dict_table_t* table; /*!< table where updated */ + upd_t* update; /*!< update vector for the row */ + ulint update_n_fields; + /* when this struct is used to implement + a cascade operation for foreign keys, we store + here the size of the buffer allocated for use + as the update vector */ + sym_node_list_t columns;/* symbol table nodes for the columns + to retrieve from the table */ + ibool has_clust_rec_x_lock; + /* TRUE if the select which retrieves the + records to update already sets an x-lock on + the clustered record; note that it must always + set at least an s-lock */ + ulint cmpl_info;/* information extracted during query + compilation; speeds up execution: + UPD_NODE_NO_ORD_CHANGE and + UPD_NODE_NO_SIZE_CHANGE, ORed */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + dict_index_t* index; /*!< NULL, or the next index whose record should + be updated */ + dtuple_t* row; /*!< NULL, or a copy (also fields copied to + heap) of the row to update; this must be reset + to NULL after a successful update */ + dtuple_t* historical_row; /*!< historical row used in + CASCADE UPDATE/SET NULL; + allocated from historical_heap */ + mem_heap_t* historical_heap; /*!< heap for historical row insertion; + created when row to update is located; + freed right before row update */ + row_ext_t* ext; /*!< NULL, or prefixes of the externally + stored columns in the old row */ + 
dtuple_t* upd_row;/* NULL, or a copy of the updated row */ + row_ext_t* upd_ext;/* NULL, or prefixes of the externally + stored columns in upd_row */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage; + this must be emptied after a successful + update */ + /*----------------------*/ + sym_node_t* table_sym;/* table node in symbol table */ + que_node_t* col_assign_list; + /* column assignment list */ + ulint magic_n; + +private: + /** Appends row_start or row_end field to update vector and sets a + CURRENT_TIMESTAMP/trx->id value to it. + Supposed to be called only by vers_make_update() and + vers_make_delete(). + @param[in] trx transaction + @param[in] idx table->vers_start or table->vers_end */ + void vers_update_fields(const trx_t *trx, ulint idx); + +public: + /** Also set row_start = CURRENT_TIMESTAMP/trx->id + @param[in] trx transaction */ + void vers_make_update(const trx_t *trx) + { + vers_update_fields(trx, table->vers_start); + } + + /** Only set row_end = CURRENT_TIMESTAMP/trx->id. + Do not touch other fields at all.
+ @param[in] trx transaction */ + void vers_make_delete(const trx_t *trx) + { + update->n_fields = 0; + is_delete = VERSIONED_DELETE; + vers_update_fields(trx, table->vers_end); + } +}; + +#define UPD_NODE_MAGIC_N 1579975 + +/* Node execution states */ +#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from + a node above and if the field + has_clust_rec_x_lock is FALSE, we + should set an intention x-lock on + the table */ +#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be + updated */ +#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be + inserted, old record is already delete + marked */ +#define UPD_NODE_UPDATE_ALL_SEC 5 /* an ordering field of the clustered + index record was changed, or this is + a delete operation: should update + all the secondary index records */ +#define UPD_NODE_UPDATE_SOME_SEC 6 /* secondary index entries should be + looked at and updated if an ordering + field changed */ + +/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */ +#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be + changed in the update and no ordering + field of the clustered index */ +#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be + changed in the update */ + + +#include "row0upd.ic" + +#endif diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic new file mode 100644 index 00000000..13aacf3f --- /dev/null +++ b/storage/innobase/include/row0upd.ic @@ -0,0 +1,153 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.ic +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "mtr0log.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "row0row.h" +#include "lock0lock.h" +#include "page0zip.h" + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap) /*!< in: heap from which memory allocated */ +{ + upd_t* update; + + update = static_cast<upd_t*>(mem_heap_zalloc( + heap, sizeof(upd_t) + sizeof(upd_field_t) * n)); + + update->n_fields = n; + update->fields = reinterpret_cast<upd_field_t*>(&update[1]); + update->heap = heap; + + return(update); +} + +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. +@return number of fields */ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + const upd_t* update) /*!< in: update vector */ +{ + ut_ad(update); + + return(update->n_fields); +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the nth field of an update vector. 
+@return update vector field */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + const upd_t* update, /*!< in: update vector */ + ulint n) /*!< in: field position in update vector */ +{ + ut_ad(update); + ut_ad(n < update->n_fields); + + return((upd_field_t*) update->fields + n); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Sets an index field number to be updated by an update vector field. +Also resets orig_len to 0 and copies the type of the index column +into the type of new_val. */ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /*!< in/out: update vector field */ + uint16_t field_no, /*!< in: field number in a clustered + index */ + dict_index_t* index) /*!< in: index */ +{ + upd_field->field_no = field_no; + upd_field->orig_len = 0; + dict_col_copy_type(dict_index_get_nth_col(index, field_no), + dfield_get_type(&upd_field->new_val)); +} + +/** Set the virtual column number to be updated by an update vector field. +Also resets orig_len to 0 and copies the type of the virtual column +into the type of new_val. +@param[in,out] upd_field update vector field +@param[in] field_no virtual column sequence number; must be less + than the number of virtual columns in index->table +@param[in] index index */ +UNIV_INLINE +void +upd_field_set_v_field_no( + upd_field_t* upd_field, + uint16_t field_no, + dict_index_t* index) +{ + ut_a(field_no < dict_table_get_n_v_cols(index->table)); + upd_field->field_no = field_no; + upd_field->orig_len = 0; + + dict_col_copy_type(&dict_table_get_nth_v_col( + index->table, field_no)->m_col, + dfield_get_type(&upd_field->new_val)); +} + +/*********************************************************************//** +Returns a field of an update vector by field_no.
+@return update vector field, or NULL */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + const upd_t* update, /*!< in: update vector */ + uint16_t no, /*!< in: field_no */ + bool is_virtual) /*!< in: if it is virtual column */ +{ + ulint i; + for (i = 0; i < upd_get_n_fields(update); i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + + /* matches only if the field matches that of is_virtual */ + if ((!is_virtual) != (!upd_fld_is_virtual_col(uf))) { + continue; + } + + if (uf->field_no == no) { + + return(uf); + } + } + + return(NULL); +} diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h new file mode 100644 index 00000000..d54384f8 --- /dev/null +++ b/storage/innobase/include/row0vers.h @@ -0,0 +1,141 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0vers.h +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0vers_h +#define row0vers_h + +#include "data0data.h" +#include "trx0types.h" +#include "que0types.h" +#include "rem0types.h" +#include "mtr0mtr.h" +#include "dict0mem.h" +#include "row0types.h" + +// Forward declaration +class ReadView; + +/** Determine if an active transaction has inserted or modified a secondary +index record. +@param[in,out] caller_trx trx of current thread +@param[in] rec secondary index record +@param[in] index secondary index +@param[in] offsets rec_get_offsets(rec, index) +@return the active transaction; state must be rechecked after +trx_mutex_enter(), and trx->release_reference() must be invoked +@retval NULL if the record was committed */ +trx_t* +row_vers_impl_x_locked( + trx_t* caller_trx, + const rec_t* rec, + dict_index_t* index, + const rec_offs* offsets); + +/** Finds out if a version of the record, where the version >= the current +purge view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE. 
+@param[in] also_curr whether rec itself is also included in the + versions to search; otherwise only versions + prior to it are searched +@param[in] rec record in the clustered index; the caller + must have a latch on the page +@param[in] mtr mtr holding the latch on rec; it will + also hold the latch on purge_view +@param[in] index secondary index +@param[in] ientry secondary index entry +@param[in] roll_ptr roll_ptr for the purge record +@param[in] trx_id transaction ID on the purging record +@return whether an earlier version should have ientry as its +secondary index entry */ +bool +row_vers_old_has_index_entry( + bool also_curr, + const rec_t* rec, + mtr_t* mtr, + dict_index_t* index, + const dtuple_t* ientry, + roll_ptr_t roll_ptr, + trx_id_t trx_id); + +/*****************************************************************//** +Constructs the version of a clustered index record which a consistent +read should see. We assume that the trx id stored in rec is such that +the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */ +dberr_t +row_vers_build_for_consistent_read( +/*===============================*/ + const rec_t* rec, /*!< in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this record */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /*!< in: the clustered index */ + rec_offs** offsets,/*!< in/out: offsets returned by + rec_get_offsets(rec, index) */ + ReadView* view, /*!< in: the consistent read view */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/*!< in: memory heap from which the memory for + *old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + rec_t** old_vers,/*!< out, own: old version, or NULL + if the history is missing or the record + does not exist in the view, that is, + it was freshly inserted afterwards */ + dtuple_t** vrow); /*!< out: reports virtual column info if any */ + +/*****************************************************************//** +Constructs the last committed version of a clustered index record, +which should be seen by a semi-consistent read.
*/ +void +row_vers_build_for_semi_consistent_read( +/*====================================*/ + trx_t* caller_trx,/*!<in/out: trx of current thread */ + const rec_t* rec, /*!< in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this records */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec */ + dict_index_t* index, /*!< in: the clustered index */ + rec_offs** offsets,/*!< in/out: offsets returned by + rec_get_offsets(rec, index) */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/*!< in: memory heap from which the memory for + *old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + const rec_t** old_vers,/*!< out: rec, old version, or NULL if the + record does not exist in the view, that is, + it was freshly inserted afterwards */ + dtuple_t** vrow); /*!< out: holds virtual column info if any + is updated in the view */ + +#endif diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h new file mode 100644 index 00000000..b50a76fa --- /dev/null +++ b/storage/innobase/include/rw_lock.h @@ -0,0 +1,112 @@ +/***************************************************************************** + +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#pragma once +#include <atomic> +#include "my_dbug.h" + +/** Simple read-write lock based on std::atomic */ +class rw_lock +{ + /** The lock word */ + std::atomic<uint32_t> lock; + +protected: + /** Available lock */ + static constexpr uint32_t UNLOCKED= 0; + /** Flag to indicate that write_lock() is being held */ + static constexpr uint32_t WRITER= 1U << 31; + /** Flag to indicate that write_lock_wait() is pending */ + static constexpr uint32_t WRITER_WAITING= 1U << 30; + /** Flag to indicate that write_lock() or write_lock_wait() is pending */ + static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING; + + /** Start waiting for an exclusive lock. */ + void write_lock_wait_start() + { lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); } + /** Try to acquire a shared lock. + @param l the value of the lock word + @return whether the lock was acquired */ + bool read_trylock(uint32_t &l) + { + l= UNLOCKED; + while (!lock.compare_exchange_strong(l, l + 1, std::memory_order_acquire, + std::memory_order_relaxed)) + { + DBUG_ASSERT(!(WRITER & l) || !(~WRITER_PENDING & l)); + if (l & WRITER_PENDING) + return false; + } + return true; + } + /** Wait for an exclusive lock. 
+ @return whether the exclusive lock was acquired */ + bool write_lock_poll() + { + auto l= WRITER_WAITING; + if (lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire, + std::memory_order_relaxed)) + return true; + if (!(l & WRITER_WAITING)) + /* write_lock() must have succeeded for another thread */ + write_lock_wait_start(); + return false; + } + +public: + /** Default constructor */ + rw_lock() : lock(UNLOCKED) {} + + /** Release a shared lock */ + void read_unlock() + { + IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(1, std::memory_order_release); + DBUG_ASSERT(l & ~WRITER_PENDING); /* at least one read lock */ + DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */ + } + /** Release an exclusive lock */ + void write_unlock() + { + IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release); + DBUG_ASSERT(l & WRITER); /* the write lock must have existed */ + } + /** Try to acquire a shared lock. + @return whether the lock was acquired */ + bool read_trylock() { uint32_t l; return read_trylock(l); } + /** Try to acquire an exclusive lock. 
+ @return whether the lock was acquired */ + bool write_trylock() + { + auto l= UNLOCKED; + return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire, + std::memory_order_relaxed); + } + + /** @return whether an exclusive lock is being held by any thread */ + bool is_write_locked() const + { return !!(lock.load(std::memory_order_relaxed) & WRITER); } + /** @return whether a shared lock is being held by any thread */ + bool is_read_locked() const + { + auto l= lock.load(std::memory_order_relaxed); + return (l & ~WRITER_PENDING) && !(l & WRITER); + } + /** @return whether any lock is being held by any thread */ + bool is_locked() const + { return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; } +}; diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h new file mode 100644 index 00000000..cbda9d06 --- /dev/null +++ b/storage/innobase/include/srv0mon.h @@ -0,0 +1,892 @@ +/*********************************************************************** + +Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file include/srv0mon.h +Server monitor counter related defines + +Created 12/15/2009 Jimmy Yang +*******************************************************/ + +#ifndef srv0mon_h +#define srv0mon_h + +#include "univ.i" + +#ifndef __STDC_LIMIT_MACROS +/* Required for FreeBSD so that INT64_MAX is defined. */ +#define __STDC_LIMIT_MACROS +#endif /* __STDC_LIMIT_MACROS */ + +#include <stdint.h> +#include "my_atomic.h" +#include "my_atomic_wrapper.h" + +/** Possible status values for "mon_status" in "struct monitor_value" */ +enum monitor_running_status { + MONITOR_STARTED = 1, /*!< Monitor has been turned on */ + MONITOR_STOPPED = 2 /*!< Monitor has been turned off */ +}; + +typedef enum monitor_running_status monitor_running_t; + +/** Monitor counter value type */ +typedef int64_t mon_type_t; + +/** Two monitor structures are defined in this file. One is +"monitor_value_t", which contains dynamic counter values for each +counter. The other is "monitor_info_t", which contains +static information (counter name, desc etc.) for each counter. +In addition, an enum datatype "monitor_id_t" is also defined; +it identifies each monitor with an internally used symbol, whose +integer value indexes into the above two structures for its dynamic +and static information. +Developers who intend to add new counters need to +fill in counter information as described in "monitor_info_t" and +create the internal counter ID in "monitor_id_t". */ + +/** Structure containing the actual values of a monitor counter.
*/ +struct monitor_value_t { + time_t mon_start_time; /*!< Start time of monitoring */ + time_t mon_stop_time; /*!< Stop time of monitoring */ + time_t mon_reset_time; /*!< Time of resetting the counter */ + mon_type_t mon_value; /*!< Current counter Value */ + mon_type_t mon_max_value; /*!< Current Max value */ + mon_type_t mon_min_value; /*!< Current Min value */ + mon_type_t mon_value_reset;/*!< value at last reset */ + mon_type_t mon_max_value_start; /*!< Max value since start */ + mon_type_t mon_min_value_start; /*!< Min value since start */ + mon_type_t mon_start_value;/*!< Value at the start time */ + mon_type_t mon_last_value; /*!< Last set of values */ + monitor_running_t mon_status; /* whether monitor still running */ +}; + +/** The following are the possible values for the "monitor_type" field in +"struct monitor_info" */ +enum monitor_type_t { + MONITOR_NONE = 0, /*!< No monitoring */ + MONITOR_MODULE = 1, /*!< This is a monitor module type, + not a counter */ + MONITOR_EXISTING = 2, /*!< The monitor carries information from + an existing system status variable */ + MONITOR_NO_AVERAGE = 4, /*!< Set this status if we don't want to + calculate the average value for the counter */ + MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the + counter, rather than incremental value + over the period.
Mostly for counters + displaying current resource usage */ + MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off + only as a module, but not individually */ + MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at + server start up */ + MONITOR_SET_OWNER = 64, /*!< Owner of "monitor set", a set of + monitor counters */ + MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */ + MONITOR_HIDDEN = 256 /*!< Do not display this monitor in the + metrics table */ +}; + +/** Counter minimum value is initialized to be max value of + mon_type_t (int64_t) */ +#ifndef INT64_MAX +#define INT64_MAX (9223372036854775807LL) +#endif +#ifndef INT64_MIN +#define INT64_MIN (-9223372036854775807LL-1) +#endif +#define MIN_RESERVED INT64_MAX +#define MAX_RESERVED INT64_MIN + +/** This enumeration defines internal monitor identifier used internally +to identify each particular counter. Its value indexes into two arrays, +one is the "innodb_counter_value" array which records actual monitor +counter values, the other is "innodb_counter_info" array which describes +each counter's basic information (name, desc etc.). 
A couple of +naming rules here: +1) If the monitor defines a module, it starts with MONITOR_MODULE +2) If the monitor uses existing counters from "status variable", its ID +name shall start with MONITOR_OVLD + +Please refer to "innodb_counter_info" in srv/srv0mon.cc for detailed +information on each monitor counter */ + +enum monitor_id_t { + /* This is to identify the default value set by the metrics + control global variables */ + MONITOR_DEFAULT_START = 0, + + /* Start of Metadata counter */ + MONITOR_MODULE_METADATA, + MONITOR_TABLE_OPEN, + MONITOR_TABLE_CLOSE, + MONITOR_TABLE_REFERENCE, + + /* Lock manager related counters */ + MONITOR_MODULE_LOCK, + MONITOR_DEADLOCK, + MONITOR_TIMEOUT, + MONITOR_LOCKREC_WAIT, + MONITOR_TABLELOCK_WAIT, + MONITOR_NUM_RECLOCK_REQ, + MONITOR_RECLOCK_CREATED, + MONITOR_RECLOCK_REMOVED, + MONITOR_NUM_RECLOCK, + MONITOR_TABLELOCK_CREATED, + MONITOR_TABLELOCK_REMOVED, + MONITOR_NUM_TABLELOCK, + MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT, + MONITOR_OVLD_LOCK_WAIT_TIME, + MONITOR_OVLD_LOCK_MAX_WAIT_TIME, + MONITOR_OVLD_ROW_LOCK_WAIT, + MONITOR_OVLD_LOCK_AVG_WAIT_TIME, + + /* Buffer and I/O related counters.
*/ + MONITOR_MODULE_BUFFER, + MONITOR_OVLD_BUFFER_POOL_SIZE, + MONITOR_OVLD_BUF_POOL_READS, + MONITOR_OVLD_BUF_POOL_READ_REQUESTS, + MONITOR_OVLD_BUF_POOL_WRITE_REQUEST, + MONITOR_OVLD_BUF_POOL_WAIT_FREE, + MONITOR_OVLD_BUF_POOL_READ_AHEAD, + MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED, + MONITOR_OVLD_BUF_POOL_PAGE_TOTAL, + MONITOR_OVLD_BUF_POOL_PAGE_MISC, + MONITOR_OVLD_BUF_POOL_PAGES_DATA, + MONITOR_OVLD_BUF_POOL_BYTES_DATA, + MONITOR_OVLD_BUF_POOL_PAGES_DIRTY, + MONITOR_OVLD_BUF_POOL_BYTES_DIRTY, + MONITOR_OVLD_BUF_POOL_PAGES_FREE, + MONITOR_OVLD_PAGE_CREATED, + MONITOR_OVLD_PAGES_WRITTEN, + MONITOR_OVLD_INDEX_PAGES_WRITTEN, + MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN, + MONITOR_OVLD_PAGES_READ, + MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS, + MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED, + MONITOR_OVLD_BYTE_READ, + MONITOR_OVLD_BYTE_WRITTEN, + MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, + MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, + + MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, + MONITOR_FLUSH_ADAPTIVE_AVG_TIME, + + MONITOR_FLUSH_ADAPTIVE_AVG_PASS, + + MONITOR_LRU_GET_FREE_LOOPS, + MONITOR_LRU_GET_FREE_WAITS, + + MONITOR_FLUSH_AVG_PAGE_RATE, + MONITOR_FLUSH_LSN_AVG_RATE, + MONITOR_FLUSH_PCT_FOR_DIRTY, + MONITOR_FLUSH_PCT_FOR_LSN, + MONITOR_FLUSH_SYNC_WAITS, + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + 
MONITOR_LRU_BATCH_FLUSH_PAGES, + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT, + MONITOR_LRU_GET_FREE_SEARCH, + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + + /* Buffer Page I/O specific counters. */ + MONITOR_MODULE_BUF_PAGE, + MONITOR_INDEX_LEAF_PAGE_READ, + MONITOR_INDEX_NON_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ, + MONITOR_UNDO_LOG_PAGE_READ, + MONITOR_INODE_PAGE_READ, + MONITOR_IBUF_FREELIST_PAGE_READ, + MONITOR_IBUF_BITMAP_PAGE_READ, + MONITOR_SYSTEM_PAGE_READ, + MONITOR_TRX_SYSTEM_PAGE_READ, + MONITOR_FSP_HDR_PAGE_READ, + MONITOR_XDES_PAGE_READ, + MONITOR_BLOB_PAGE_READ, + MONITOR_ZBLOB_PAGE_READ, + MONITOR_ZBLOB2_PAGE_READ, + MONITOR_OTHER_PAGE_READ, + MONITOR_INDEX_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN, + MONITOR_UNDO_LOG_PAGE_WRITTEN, + MONITOR_INODE_PAGE_WRITTEN, + MONITOR_IBUF_FREELIST_PAGE_WRITTEN, + MONITOR_IBUF_BITMAP_PAGE_WRITTEN, + MONITOR_SYSTEM_PAGE_WRITTEN, + MONITOR_TRX_SYSTEM_PAGE_WRITTEN, + MONITOR_FSP_HDR_PAGE_WRITTEN, + MONITOR_XDES_PAGE_WRITTEN, + MONITOR_BLOB_PAGE_WRITTEN, + MONITOR_ZBLOB_PAGE_WRITTEN, + MONITOR_ZBLOB2_PAGE_WRITTEN, + MONITOR_OTHER_PAGE_WRITTEN, + + /* OS level counters (I/O) */ + MONITOR_MODULE_OS, + MONITOR_OVLD_OS_FILE_READ, + MONITOR_OVLD_OS_FILE_WRITE, + MONITOR_OVLD_OS_FSYNC, + MONITOR_OS_PENDING_READS, + MONITOR_OS_PENDING_WRITES, + MONITOR_OVLD_OS_LOG_WRITTEN, + MONITOR_OVLD_OS_LOG_FSYNC, + MONITOR_OVLD_OS_LOG_PENDING_FSYNC, + MONITOR_OVLD_OS_LOG_PENDING_WRITES, + + /* Transaction related counters */ + MONITOR_MODULE_TRX, + MONITOR_TRX_RW_COMMIT, + MONITOR_TRX_RO_COMMIT, + 
MONITOR_TRX_NL_RO_COMMIT, + MONITOR_TRX_COMMIT_UNDO, + MONITOR_TRX_ROLLBACK, + MONITOR_TRX_ROLLBACK_SAVEPOINT, + MONITOR_TRX_ACTIVE, + MONITOR_RSEG_HISTORY_LEN, + MONITOR_NUM_UNDO_SLOT_USED, + MONITOR_NUM_UNDO_SLOT_CACHED, + MONITOR_RSEG_CUR_SIZE, + + /* Purge related counters */ + MONITOR_MODULE_PURGE, + MONITOR_N_DEL_ROW_PURGE, + MONITOR_N_UPD_EXIST_EXTERN, + MONITOR_PURGE_INVOKED, + MONITOR_PURGE_N_PAGE_HANDLED, + MONITOR_DML_PURGE_DELAY, + MONITOR_PURGE_STOP_COUNT, + MONITOR_PURGE_RESUME_COUNT, + + /* Recovery related counters */ + MONITOR_MODULE_RECOVERY, + MONITOR_NUM_CHECKPOINT, + MONITOR_OVLD_LSN_FLUSHDISK, + MONITOR_OVLD_LSN_CHECKPOINT, + MONITOR_OVLD_LSN_CURRENT, + MONITOR_LSN_CHECKPOINT_AGE, + MONITOR_OVLD_BUF_OLDEST_LSN, + MONITOR_OVLD_MAX_AGE_ASYNC, + MONITOR_PENDING_LOG_FLUSH, + MONITOR_PENDING_CHECKPOINT_WRITE, + MONITOR_LOG_IO, + MONITOR_OVLD_LOG_WAITS, + MONITOR_OVLD_LOG_WRITE_REQUEST, + MONITOR_OVLD_LOG_WRITES, + MONITOR_OVLD_LOG_PADDED, + + /* Page Manager related counters */ + MONITOR_MODULE_PAGE, + MONITOR_PAGE_COMPRESS, + MONITOR_PAGE_DECOMPRESS, + MONITOR_PAD_INCREMENTS, + MONITOR_PAD_DECREMENTS, + /* New monitor variables for page compression */ + MONITOR_OVLD_PAGE_COMPRESS_SAVED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSED, + MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP, + MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, + + /* New monitor variables for page encryption */ + MONITOR_OVLD_PAGES_ENCRYPTED, + MONITOR_OVLD_PAGES_DECRYPTED, + + /* Index related counters */ + MONITOR_MODULE_INDEX, + MONITOR_INDEX_SPLIT, + MONITOR_INDEX_MERGE_ATTEMPTS, + MONITOR_INDEX_MERGE_SUCCESSFUL, + MONITOR_INDEX_REORG_ATTEMPTS, + MONITOR_INDEX_REORG_SUCCESSFUL, + MONITOR_INDEX_DISCARD, + +#ifdef BTR_CUR_HASH_ADAPT + /* Adaptive Hash Index related counters */ + MONITOR_MODULE_ADAPTIVE_HASH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH, +#endif /* BTR_CUR_HASH_ADAPT */ + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE, +#ifdef BTR_CUR_HASH_ADAPT + 
MONITOR_ADAPTIVE_HASH_PAGE_ADDED, + MONITOR_ADAPTIVE_HASH_PAGE_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_ADDED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND, + MONITOR_ADAPTIVE_HASH_ROW_UPDATED, +#endif /* BTR_CUR_HASH_ADAPT */ + + /* Tablespace related counters */ + MONITOR_MODULE_FIL_SYSTEM, + MONITOR_OVLD_N_FILE_OPENED, + + /* InnoDB Change Buffer related counters */ + MONITOR_MODULE_IBUF_SYSTEM, + MONITOR_OVLD_IBUF_MERGE_INSERT, + MONITOR_OVLD_IBUF_MERGE_DELETE, + MONITOR_OVLD_IBUF_MERGE_PURGE, + MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT, + MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE, + MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE, + MONITOR_OVLD_IBUF_MERGES, + MONITOR_OVLD_IBUF_SIZE, + + /* Counters for server operations */ + MONITOR_MODULE_SERVER, + MONITOR_MASTER_THREAD_SLEEP, + MONITOR_OVLD_SERVER_ACTIVITY, + MONITOR_MASTER_ACTIVE_LOOPS, + MONITOR_MASTER_IDLE_LOOPS, + MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, + MONITOR_SRV_LOG_FLUSH_MICROSECOND, + MONITOR_SRV_DICT_LRU_MICROSECOND, + MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE, + MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE, + MONITOR_OVLD_SRV_DBLWR_WRITES, + MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN, + MONITOR_OVLD_SRV_PAGE_SIZE, + MONITOR_OVLD_RWLOCK_S_SPIN_WAITS, + MONITOR_OVLD_RWLOCK_X_SPIN_WAITS, + MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS, + MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS, + MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS, + MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS, + MONITOR_OVLD_RWLOCK_S_OS_WAITS, + MONITOR_OVLD_RWLOCK_X_OS_WAITS, + MONITOR_OVLD_RWLOCK_SX_OS_WAITS, + + /* Data DML related counters */ + MONITOR_MODULE_DML_STATS, + MONITOR_OLVD_ROW_READ, + MONITOR_OLVD_ROW_INSERTED, + MONITOR_OLVD_ROW_DELETED, + MONITOR_OLVD_ROW_UPDTATED, + MONITOR_OLVD_SYSTEM_ROW_READ, + MONITOR_OLVD_SYSTEM_ROW_INSERTED, + MONITOR_OLVD_SYSTEM_ROW_DELETED, + MONITOR_OLVD_SYSTEM_ROW_UPDATED, + + /* Data DDL related counters */ + MONITOR_MODULE_DDL_STATS, + MONITOR_BACKGROUND_DROP_INDEX, + MONITOR_BACKGROUND_DROP_TABLE, + 
MONITOR_ONLINE_CREATE_INDEX,
+	MONITOR_PENDING_ALTER_TABLE,
+	MONITOR_ALTER_TABLE_SORT_FILES,
+	MONITOR_ALTER_TABLE_LOG_FILES,
+
+	MONITOR_MODULE_ICP,
+	MONITOR_ICP_ATTEMPTS,
+	MONITOR_ICP_NO_MATCH,
+	MONITOR_ICP_OUT_OF_RANGE,
+	MONITOR_ICP_MATCH,
+
+	/* Mutex/RW-Lock related counters */
+	MONITOR_MODULE_LATCHES,
+	MONITOR_LATCHES,
+
+	/* This is used only by the control system to turn
+	on/off and reset all monitor counters */
+	MONITOR_ALL_COUNTER,
+
+	/* This must be the last member */
+	NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wild card match */
+#define	MONITOR_WILDCARD_MATCH		(NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define	MONITOR_NO_MATCH		(NUM_MONITOR + 2)
+
+/** struct monitor_info_t describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+	const char*	monitor_name;	/*!< Monitor name */
+	const char*	monitor_module;	/*!< Sub Module the monitor
+					belongs to */
+	const char*	monitor_desc;	/*!< Brief desc of monitor counter */
+	monitor_type_t	monitor_type;	/*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of the counter that
+					is related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+	monitor_id_t	monitor_id;	/*!< Monitor ID as defined in enum
+					monitor_id_t */
+};
+
+/** Following are the "set_option" values allowed for the
+srv_mon_set_module_control() and srv_mon_process_existing_counter()
+functions, to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+	MONITOR_TURN_ON = 1,		/*!< Turn on the counter */
+	MONITOR_TURN_OFF,		/*!< Turn off the counter */
+	MONITOR_RESET_VALUE,		/*!< Reset current values */
+	MONITOR_RESET_ALL_VALUE,	/*!< Reset all values */
+	MONITOR_GET_VALUE		/*!< Option for
+					srv_mon_process_existing_counter()
+					function */
+};
+
+/** Number of bits in a ulint datatype */
+#define	NUM_BITS_ULINT	(sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap that records whether a particular
+monitor counter has been turned on or off (one bit per monitor id) */
+extern Atomic_relaxed<ulint>
+    monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option. */
+#define MONITOR_ON(monitor)						\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_or(	\
+		(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+#define MONITOR_OFF(monitor)						\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_and(	\
+		~(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+/** Check whether the requested monitor is turned on/off; the result is
+the masked bit (nonzero when on), not a normalized 0/1 */
+#define MONITOR_IS_ON(monitor)					\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] &	\
+		(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monitor counter
+value */
+extern monitor_value_t	 innodb_counter_value[NUM_MONITOR];
+
+/** Following are macro defines for basic monitor counter manipulations.
+Please note we do not provide any synchronization for these monitor
+operations due to performance consideration. Most counters can
+be placed under existing mutex protections in respective code
+module.
*/
+
+/** Macros to access various fields of a monitor counter */
+#define MONITOR_FIELD(monitor, field)			\
+		(innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor)				\
+		MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor)			\
+		MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor)		\
+		(MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor)				\
+		MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STARTED;		\
+		MONITOR_FIELD((monitor), mon_start_time) = time(NULL);	\
+	} while (0)
+
+#define MONITOR_SET_OFF(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STOPPED;		\
+		MONITOR_FIELD((monitor), mon_stop_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_INIT_ZERO_VALUE		0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor)				\
+		(MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE	\
+		 && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE	\
+		 && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor)						\
+	if (MONITOR_MAX_MIN_NOT_INIT(monitor)) {			\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+	}
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects that appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters. Expands to a bare if statement (dangling-else hazard). */
+#define	MONITOR_INC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1
+@param enabled	whether the monitor is enabled */
+#define MONITOR_ATOMIC_INC_LOW(monitor, enabled)			\
+	if (enabled) {							\
+		ib_uint64_t	value;					\
+		value  = my_atomic_add64_explicit(			\
+			(int64*) &MONITOR_VALUE(monitor), 1,		\
+			MY_MEMORY_ORDER_RELAXED) + 1;			\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race, we ignore it due to performance. */	\
+		if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) {	\
+			MONITOR_MAX_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor monitor to be decremented by 1 +@param enabled whether the monitor is enabled */ +#define MONITOR_ATOMIC_DEC_LOW(monitor, enabled) \ + if (enabled) { \ + ib_uint64_t value; \ + value = my_atomic_add64_explicit( \ + (int64*) &MONITOR_VALUE(monitor), -1, \ + MY_MEMORY_ORDER_RELAXED) - 1; \ + /* Note: This is not 100% accurate because of the \ + inherent race, we ignore it due to performance. */ \ + if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = value; \ + } \ + } + +/** Atomically increment a monitor counter if it is enabled. +Use MONITOR_INC if appropriate mutex protection exists. +@param monitor monitor to be incremented by 1 */ +#define MONITOR_ATOMIC_INC(monitor) \ + MONITOR_ATOMIC_INC_LOW(monitor, MONITOR_IS_ON(monitor)) +/** Atomically decrement a monitor counter if it is enabled. +Use MONITOR_DEC if appropriate mutex protection exists. +@param monitor monitor to be decremented by 1 */ +#define MONITOR_ATOMIC_DEC(monitor) \ + MONITOR_ATOMIC_DEC_LOW(monitor, MONITOR_IS_ON(monitor)) + +#define MONITOR_DEC(monitor) \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor)--; \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +#ifdef HAVE_MEM_CHECK +# define MONITOR_CHECK_DEFINED(value) do { \ + mon_type_t m __attribute__((unused))= value; \ + MEM_CHECK_DEFINED(&m, sizeof m); \ +} while (0) +#else /* HAVE_MEM_CHECK */ +# define MONITOR_CHECK_DEFINED(value) (void) 0 +#endif /* HAVE_MEM_CHECK */ + +#define MONITOR_INC_VALUE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor) += (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +#define MONITOR_DEC_VALUE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + 
ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value); \ + MONITOR_VALUE(monitor) -= (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +/* Increment/decrement counter without check the monitor on/off bit, which +could already be checked as a module group */ +#define MONITOR_INC_NOCHECK(monitor) \ + do { \ + MONITOR_VALUE(monitor)++; \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } while (0) \ + +#define MONITOR_DEC_NOCHECK(monitor) \ + do { \ + MONITOR_VALUE(monitor)--; \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } while (0) + +/** Directly set a monitor counter's value */ +#define MONITOR_SET(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor) = (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +/** Add time difference between now and input "value" (in seconds) to the +monitor counter +@param monitor monitor to update for the time difference +@param value the start time value */ +#define MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + uintmax_t old_time = value; \ + value = microsecond_interval_timer(); \ + MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\ + } + +/** This macro updates 3 counters in one call. However, it only checks the +main/first monitor counter 'monitor', to see it is on or off to decide +whether to do the update. +@param monitor the main monitor counter to update. 
It accounts for
+			the accumulative value for the counter.
+@param monitor_n_calls	counter that counts number of times this macro is
+			called
+@param monitor_per_call	counter that records the current and max value of
+			each incremental value
+@param value		incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE(					\
+		monitor, monitor_n_calls, monitor_per_call, value)	\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor_n_calls)++;			\
+		MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor_per_call)			\
+		    > MONITOR_MAX_VALUE(monitor_per_call)) {		\
+			MONITOR_MAX_VALUE(monitor_per_call) =		\
+				 (mon_type_t) (value);			\
+		}							\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only max value needs to be updated */
+#define MONITOR_SET_UPD_MAX_ONLY(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Some values such as log sequence number are monotonically increasing
+numbers and do not need max/min values recorded */
+#define MONITOR_SET_SIMPLE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+	}
+
+/** Reset the monitor value, reset value, last value and the start/stop/reset
+times to zero, and the max/min values to MAX_RESERVED/MIN_RESERVED. The reset
+operation would only be conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_start_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_stop_time) =			\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_reset_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+	} while (0)
+
+/** The following macros define the operations needed to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value, less MONITOR_VALUE_RESET(monitor), to the
+mon_start_value field of the monitor counter */
+#define MONITOR_SAVE_START(monitor, value) do {				\
+	MONITOR_CHECK_DEFINED(value);					\
+	(MONITOR_START_VALUE(monitor) =					\
+		(mon_type_t) (value) - MONITOR_VALUE_RESET(monitor));	\
+	} while (0)
+
+/** Save the current counter value to the mon_last_value field, and add
+it to mon_start_value */
+#define MONITOR_SAVE_LAST(monitor)					\
+	do {								\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor);	\
+		MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor);	\
+	} while (0)
+
+/** Set monitor value to the difference of value and mon_start_value
+compensated by mon_last_value if accumulated value is required.
*/
+#define MONITOR_SET_DIFF(monitor, value)				\
+	MONITOR_SET_UPD_MAX_ONLY(monitor, ((value)			\
+	- MONITOR_VALUE_RESET(monitor)					\
+	- MONITOR_FIELD(monitor, mon_start_value)			\
+	+ MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array)
+@return pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+/****************************************************************//**
+Get monitor's name by its monitor id (index into the
+innodb_counter_info array)
+@return corresponding monitor name, or NULL if no such
+monitor */
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is NUM_MONITOR then turn on all monitor counters.
+The function is declared void, so no status is returned to the
+caller. */
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_id_t. If it is
+					set to NUM_MONITOR, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+a mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do appropriate
+mathematics to deduce the actual value. */
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_id_t */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/*************************************************************//**
+Calculate the maximum counter value since the monitor counter was
+started (defined inline in srv0mon.ic)
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Calculate the minimum counter value since the monitor counter was
+started (defined inline in srv0mon.ic)
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Reset a monitor, create a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Reset all values of a monitor counter (defined inline in srv0mon.ic) */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#include "srv0mon.ic"
+
+#endif
diff --git a/storage/innobase/include/srv0mon.ic b/storage/innobase/include/srv0mon.ic
new file mode 100644
index 00000000..158345b2
--- /dev/null
+++ b/storage/innobase/include/srv0mon.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.ic
+Server monitoring system
+
+Created 1/20/2010 Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+Calculate the maximum counter value since the monitor counter was
+started, updating MONITOR_MAX_VALUE_START(monitor) when it is stale
+@return max counter value since start.
*/
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+		/* MONITOR_MAX_VALUE_START has not yet been
+		initialized, the max value since start is the
+		max count in MONITOR_MAX_VALUE */
+		MONITOR_MAX_VALUE_START(monitor) =
+				MONITOR_MAX_VALUE(monitor);
+
+	} else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+		   && (MONITOR_MAX_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		      > MONITOR_MAX_VALUE_START(monitor))) {
+
+		/* If the max value since reset (as specified
+		in MONITOR_MAX_VALUE) plus the reset value is
+		larger than MONITOR_MAX_VALUE_START, reset
+		MONITOR_MAX_VALUE_START to this new max value */
+		MONITOR_MAX_VALUE_START(monitor) =
+				MONITOR_MAX_VALUE(monitor)
+				+ MONITOR_VALUE_RESET(monitor);
+	}
+
+	return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+Calculate the minimum counter value since the monitor counter was
+started, updating MONITOR_MIN_VALUE_START(monitor) when it is stale
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) {
+
+		/* MONITOR_MIN_VALUE_START has not yet been
+		initialized, the min value since start is the
+		min count in MONITOR_MIN_VALUE */
+		MONITOR_MIN_VALUE_START(monitor) =
+				MONITOR_MIN_VALUE(monitor);
+
+	} else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED
+		   && (MONITOR_MIN_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		       < MONITOR_MIN_VALUE_START(monitor))) {
+
+		/* If the min value since reset (as specified
+		in MONITOR_MIN_VALUE) plus the reset value is
+		less than MONITOR_MIN_VALUE_START, reset
+		MONITOR_MIN_VALUE_START to this new min value */
+		MONITOR_MIN_VALUE_START(monitor) =
+				MONITOR_MIN_VALUE(monitor)
+				+ MONITOR_VALUE_RESET(monitor);
+	}
+
+	return(MONITOR_MIN_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+Reset all values of a monitor counter, unless it is still on (then only warn) */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	/* Do not reset all counter values if monitor is still on;
+	report the refusal on stderr instead. */
+	if (MONITOR_IS_ON(monitor)) {
+		fprintf(stderr, "InnoDB: Cannot reset all values for"
+			" monitor counter %s while it is on. Please"
+			" turn it off and retry.\n",
+			srv_mon_get_name(monitor));
+	} else {
+		MONITOR_RESET_ALL(monitor);
+	}
+}
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
new file mode 100644
index 00000000..a5bebc34
--- /dev/null
+++ b/storage/innobase/include/srv0srv.h
@@ -0,0 +1,868 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+ +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0srv.h +The server main program + +Created 10/10/1995 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "log0log.h" +#include "que0types.h" +#include "trx0types.h" +#include "fil0fil.h" + +#include "mysql/psi/mysql_stage.h" +#include "mysql/psi/psi.h" +#include <tpool.h> +#include <memory> + +/** Global counters used inside InnoDB. 
*/
+struct srv_stats_t
+{
+	typedef ib_counter_t<ulint, 64>		ulint_ctr_64_t;
+	typedef simple_counter<lsn_t>		lsn_ctr_1_t;
+	typedef simple_counter<ulint>		ulint_ctr_1_t;
+	typedef simple_counter<int64_t>		int64_ctr_1_t;
+
+	/** Count the amount of data written in total (in bytes) */
+	ulint_ctr_1_t		data_written;
+
+	/** Number of the log write requests done */
+	ulint_ctr_1_t		log_write_requests;
+
+	/** Number of physical writes to the log performed */
+	ulint_ctr_1_t		log_writes;
+
+	/** Amount of data padded for log write ahead */
+	ulint_ctr_1_t		log_padded;
+
+	/** Amount of data written to the log files in bytes */
+	lsn_ctr_1_t		os_log_written;
+
+	/** Number of writes being done to the log files */
+	ulint_ctr_1_t		os_log_pending_writes;
+
+	/** We increase this counter, when we don't have enough
+	space in the log buffer and have to flush it */
+	ulint_ctr_1_t		log_waits;
+
+	/** Store the number of write requests issued */
+	ulint_ctr_1_t		buf_pool_write_requests;
+
+	/** Number of buffer pool reads that led to the reading of
+	a disk page */
+	ulint_ctr_1_t		buf_pool_reads;
+
+	/** Number of bytes saved by page compression */
+	ulint_ctr_64_t          page_compression_saved;
+	/** Number of index pages written */
+	ulint_ctr_64_t          index_pages_written;
+	/** Number of non index pages written */
+	ulint_ctr_64_t          non_index_pages_written;
+	/** Number of pages compressed with page compression */
+	ulint_ctr_64_t          pages_page_compressed;
+	/** Number of TRIM operations induced by page compression */
+	ulint_ctr_64_t          page_compressed_trim_op;
+	/** Number of pages decompressed with page compression */
+	ulint_ctr_64_t          pages_page_decompressed;
+	/** Number of page compression errors */
+	ulint_ctr_64_t          pages_page_compression_error;
+	/** Number of pages encrypted */
+	ulint_ctr_64_t          pages_encrypted;
+	/** Number of pages decrypted */
+	ulint_ctr_64_t          pages_decrypted;
+	/** Number of merge blocks encrypted */
+	ulint_ctr_64_t          n_merge_blocks_encrypted;
+	/** Number of merge blocks decrypted */
+	ulint_ctr_64_t          n_merge_blocks_decrypted;
+	/** Number of row log blocks encrypted */
+	ulint_ctr_64_t          n_rowlog_blocks_encrypted;
+	/** Number of row log blocks decrypted */
+	ulint_ctr_64_t          n_rowlog_blocks_decrypted;
+
+	/** Amount of data read in total (in bytes) */
+	ulint_ctr_1_t		data_read;
+
+	/** Wait time of database locks */
+	int64_ctr_1_t		n_lock_wait_time;
+
+	/** Number of database lock waits */
+	ulint_ctr_1_t		n_lock_wait_count;
+
+	/** Number of threads currently waiting on database locks */
+	MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<ulint>
+				n_lock_wait_current_count;
+
+	/** Number of rows read. */
+	ulint_ctr_64_t		n_rows_read;
+
+	/** Number of rows updated */
+	ulint_ctr_64_t		n_rows_updated;
+
+	/** Number of rows deleted */
+	ulint_ctr_64_t		n_rows_deleted;
+
+	/** Number of rows inserted */
+	ulint_ctr_64_t		n_rows_inserted;
+
+	/** Number of system rows read. */
+	ulint_ctr_64_t		n_system_rows_read;
+
+	/** Number of system rows updated */
+	ulint_ctr_64_t		n_system_rows_updated;
+
+	/** Number of system rows deleted */
+	ulint_ctr_64_t		n_system_rows_deleted;
+
+	/** Number of system rows inserted */
+	ulint_ctr_64_t		n_system_rows_inserted;
+
+	/** Number of times secondary index lookup triggered cluster lookup */
+	ulint_ctr_64_t		n_sec_rec_cluster_reads;
+
+	/** Number of times prefix optimization avoided triggering cluster lookup */
+	ulint_ctr_64_t		n_sec_rec_cluster_reads_avoided;
+
+	/** Number of encryption_get_latest_key_version calls */
+	ulint_ctr_64_t		n_key_requests;
+
+	/** Number of spaces in keyrotation list */
+	ulint_ctr_64_t		key_rotation_list_length;
+
+	/** Number of temporary tablespace blocks encrypted */
+	ulint_ctr_64_t		n_temp_blocks_encrypted;
+
+	/** Number of temporary tablespace blocks decrypted */
+	ulint_ctr_64_t		n_temp_blocks_decrypted;
+
+	/** Number of lock deadlocks */
+	ulint_ctr_1_t		lock_deadlock_count;
+};
+
+/** We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB.
srv_start() sets the value. */
+extern ulint	srv_max_n_threads;
+
+extern const char*	srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char	srv_mysql50_table_name_prefix[10];
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
+extern char*		srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char		srv_buffer_pool_dump_at_shutdown;
+extern char		srv_buffer_pool_load_at_startup;
+
+/* Whether to disable file system cache if it is defined */
+extern char		srv_disable_sort_file_cache;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT (srv_sys_space.get_autoextend_increment())
+
+/** Mutex protecting page_zip_stat_per_index */
+extern ib_mutex_t	page_zip_stat_per_index_mutex;
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+extern ib_mutex_t	srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE*	srv_monitor_file;
+/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern ib_mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE*	srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char*	srv_data_home;
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool	srv_read_only_mode;
+/** Set if InnoDB operates in read-only mode or innodb-force-recovery
+is greater than SRV_FORCE_NO_IBUF_MERGE. */
+extern my_bool	high_level_read_only;
+/** store to its own file each table created by a user; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool	srv_file_per_table;
+
+/** Sort buffer size in index creation */
+extern ulong	srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long	srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio.
+Currently we support native aio on windows and linux */
+extern my_bool	srv_use_native_aio;
+extern my_bool	srv_numa_interleave;
+
+/* Use atomic writes, i.e. disable doublewrite buffer */
+extern my_bool srv_use_atomic_writes;
+
+/* Compression algorithm */
+extern ulong innodb_compression_algorithm;
+
+/** TRUE if the server was successfully started */
+extern bool	srv_was_started;
+
+/** Server undo tablespaces directory, can be absolute path. */
+extern char*	srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern ulong	srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are active (hosting some rollback
+segment). It is quite possible that some of the tablespaces don't host
+any rollback segment, depending on the configuration used. */
+extern ulint	srv_undo_tablespaces_active;
+
+/** Maximum size of undo tablespace. */
+extern unsigned long long	srv_max_undo_log_size;
+
+extern uint	srv_n_fil_crypt_threads;
+extern uint	srv_n_fil_crypt_threads_started;
+
+/** Rate at which UNDO records should be purged. */
+extern ulong	srv_purge_rseg_truncate_frequency;
+
+/** Enable or Disable Truncate of UNDO tablespace. */
+extern my_bool	srv_undo_log_truncate;
+
+/* Optimize prefix index queries to skip cluster index lookup when possible */
+/* Enables or disables this prefix optimization. Disabled by default.
*/ +extern my_bool srv_prefix_index_cluster_optimization; + +/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */ +constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) / + UNIV_PAGE_SIZE_DEF; + +extern char* srv_log_group_home_dir; + +/** The InnoDB redo log file size, or 0 when changing the redo log format +at startup (while disallowing writes to the redo log). */ +extern ulonglong srv_log_file_size; +extern ulong srv_log_buffer_size; +extern ulong srv_flush_log_at_trx_commit; +extern uint srv_flush_log_at_timeout; +extern ulong srv_log_write_ahead_size; +extern my_bool srv_adaptive_flushing; +extern my_bool srv_flush_sync; + +#ifdef WITH_INNODB_DISALLOW_WRITES +/* When this event is reset we do not allow any file writes to take place. */ +extern os_event_t srv_allow_writes_event; +#endif /* WITH_INNODB_DISALLOW_WRITES */ + +/* If this flag is TRUE, then we will load the indexes' (and tables') metadata +even if they are marked as "corrupted". Mostly it is for DBA to process +corrupted index and table */ +extern my_bool srv_load_corrupted; + +/** Requested size in bytes */ +extern ulint srv_buf_pool_size; +/** Minimum pool size in bytes */ +extern const ulint srv_buf_pool_min_size; +/** Default pool size in bytes */ +extern const ulint srv_buf_pool_def_size; +/** Requested buffer pool chunk size */ +extern ulong srv_buf_pool_chunk_unit; +/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ +extern ulong srv_LRU_scan_depth; +/** Whether or not to flush neighbors of a block */ +extern ulong srv_flush_neighbors; +/** Previously requested size */ +extern ulint srv_buf_pool_old_size; +/** Current size as scaling factor for the other components */ +extern ulint srv_buf_pool_base_size; +/** Current size in bytes */ +extern ulint srv_buf_pool_curr_size; +/** Dump this % of each buffer pool during BP dump */ +extern ulong srv_buf_pool_dump_pct; +#ifdef UNIV_DEBUG +/** Abort load after this amount of pages */ +extern ulong 
srv_buf_pool_load_pages_abort; +#endif +/** Lock table size in bytes */ +extern ulint srv_lock_table_size; + +extern uint srv_n_file_io_threads; +extern my_bool srv_random_read_ahead; +extern ulong srv_read_ahead_threshold; +extern uint srv_n_read_io_threads; +extern uint srv_n_write_io_threads; + +/* Defragmentation. Originally the Facebook default value was 100, but it's too high */ +#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40 +extern my_bool srv_defragment; +extern uint srv_defragment_n_pages; +extern uint srv_defragment_stats_accuracy; +extern uint srv_defragment_fill_factor_n_recs; +extern double srv_defragment_fill_factor; +extern uint srv_defragment_frequency; +extern ulonglong srv_defragment_interval; + +extern uint srv_change_buffer_max_size; + +/* Number of IO operations per second the server can do */ +extern ulong srv_io_capacity; + +/* We use this dummy default value at startup for max_io_capacity. +The real value is set based on the value of io_capacity. */ +#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL) +#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL) +extern ulong srv_max_io_capacity; + +/* The "innodb_stats_method" setting decides how InnoDB is going +to treat NULL values when collecting statistics. It is not defined +as enum type because the configure option takes unsigned integer type. */ +extern ulong srv_innodb_stats_method; + +extern ulint srv_max_n_open_files; + +extern double srv_max_buf_pool_modified_pct; +extern double srv_max_dirty_pages_pct_lwm; + +extern double srv_adaptive_flushing_lwm; +extern ulong srv_flushing_avg_loops; + +extern ulong srv_force_recovery; + +/** innodb_fast_shutdown=1 skips purge and change buffer merge. +innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint). +innodb_fast_shutdown=3 is a clean shutdown that skips the rollback +of active transactions (to be done on restart). 
*/ +extern uint srv_fast_shutdown; + +extern ibool srv_innodb_status; + +extern unsigned long long srv_stats_transient_sample_pages; +extern my_bool srv_stats_persistent; +extern unsigned long long srv_stats_persistent_sample_pages; +extern my_bool srv_stats_auto_recalc; +extern my_bool srv_stats_include_delete_marked; +extern unsigned long long srv_stats_modified_counter; +extern my_bool srv_stats_sample_traditional; + +extern my_bool srv_use_doublewrite_buf; +extern ulong srv_checksum_algorithm; + +extern my_bool srv_force_primary_key; + +extern ulong srv_max_purge_lag; +extern ulong srv_max_purge_lag_delay; + +extern my_bool innodb_encrypt_temporary_tables; + +extern my_bool srv_immediate_scrub_data_uncompressed; +/*-------------------------------------------*/ + +/** Modes of operation */ +enum srv_operation_mode { + /** Normal mode (MariaDB Server) */ + SRV_OPERATION_NORMAL, + /** Mariabackup taking a backup */ + SRV_OPERATION_BACKUP, + /** Mariabackup restoring a backup for subsequent --copy-back */ + SRV_OPERATION_RESTORE, + /** Mariabackup restoring the incremental part of a backup */ + SRV_OPERATION_RESTORE_DELTA, + /** Mariabackup restoring a backup for subsequent --export */ + SRV_OPERATION_RESTORE_EXPORT +}; + +/** Current mode of operation */ +extern enum srv_operation_mode srv_operation; + +extern my_bool srv_print_innodb_monitor; +extern my_bool srv_print_innodb_lock_monitor; +extern ibool srv_print_verbose_log; + +extern bool srv_monitor_active; + + +extern ulong srv_n_spin_wait_rounds; +extern uint srv_spin_wait_delay; + +extern ulint srv_truncated_status_writes; +/** Number of initialized rollback segments for persistent undo log */ +extern ulong srv_available_undo_logs; +/** Iterations of the loop bounded by 'srv_active' label. */ +extern ulint srv_main_active_loops; +/** Iterations of the loop bounded by the 'srv_idle' label. */ +extern ulint srv_main_idle_loops; +/** Log writes involving flush. 
*/ +extern ulint srv_log_writes_and_flush; + +#ifdef UNIV_DEBUG +extern my_bool innodb_evict_tables_on_commit_debug; +extern my_bool srv_sync_debug; +extern my_bool srv_purge_view_update_only_debug; + +/** Value of MySQL global used to disable master thread. */ +extern my_bool srv_master_thread_disabled_debug; +/** InnoDB system tablespace to set during recovery */ +extern uint srv_sys_space_size_debug; +/** whether redo log file has been created at startup */ +extern bool srv_log_file_created; +#endif /* UNIV_DEBUG */ + +extern ulint srv_dml_needed_delay; + +#define SRV_MAX_N_IO_THREADS 130 + +/** innodb_purge_threads; the number of purge tasks to use */ +extern uint srv_n_purge_threads; + +/* the number of pages to purge in one batch */ +extern ulong srv_purge_batch_size; + +/* the number of sync wait arrays */ +extern ulong srv_sync_array_size; + +/* print all user-level transaction deadlocks to mysqld stderr */ +extern my_bool srv_print_all_deadlocks; + +extern my_bool srv_cmp_per_index_enabled; + +/* is encryption enabled */ +extern ulong srv_encrypt_tables; + +/** Status variables to be passed to MySQL */ +extern struct export_var_t export_vars; + +/** Global counters */ +extern srv_stats_t srv_stats; + +/** Fatal semaphore wait threshold = maximum number of seconds +that semaphore times out in InnoDB */ +#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600 +extern ulong srv_fatal_semaphore_wait_threshold; + +/** Buffer pool dump status frequency in percent */ +extern ulong srv_buf_dump_status_frequency; + +# ifdef UNIV_PFS_THREAD +extern mysql_pfs_key_t page_cleaner_thread_key; +extern mysql_pfs_key_t trx_rollback_clean_thread_key; +extern mysql_pfs_key_t thread_pool_thread_key; + +/* This macro registers the current thread and its key with performance +schema */ +# define pfs_register_thread(key) \ +do { \ + struct PSI_thread* psi __attribute__((unused)) \ + = PSI_CALL_new_thread(key, NULL, 0); \ + PSI_CALL_set_thread_os_id(psi); \ + 
PSI_CALL_set_thread(psi); \ +} while (0) + +/* This macro delist the current thread from performance schema */ +# define pfs_delete_thread() \ +do { \ + PSI_CALL_delete_current_thread(); \ +} while (0) +# else +# define pfs_register_thread(key) +# define pfs_delete_thread() +# endif /* UNIV_PFS_THREAD */ + +#ifdef HAVE_PSI_STAGE_INTERFACE +/** Performance schema stage event for monitoring ALTER TABLE progress +everything after flush log_make_checkpoint(). */ +extern PSI_stage_info srv_stage_alter_table_end; + +/** Performance schema stage event for monitoring ALTER TABLE progress +row_merge_insert_index_tuples(). */ +extern PSI_stage_info srv_stage_alter_table_insert; + +/** Performance schema stage event for monitoring ALTER TABLE progress +row_log_apply(). */ +extern PSI_stage_info srv_stage_alter_table_log_index; + +/** Performance schema stage event for monitoring ALTER TABLE progress +row_log_table_apply(). */ +extern PSI_stage_info srv_stage_alter_table_log_table; + +/** Performance schema stage event for monitoring ALTER TABLE progress +row_merge_sort(). */ +extern PSI_stage_info srv_stage_alter_table_merge_sort; + +/** Performance schema stage event for monitoring ALTER TABLE progress +row_merge_read_clustered_index(). */ +extern PSI_stage_info srv_stage_alter_table_read_pk_internal_sort; + +/** Performance schema stage event for monitoring buffer pool load progress. */ +extern PSI_stage_info srv_stage_buffer_pool_load; +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +/** Alternatives for srv_force_recovery. Non-zero values are intended +to help the user get a damaged database up so that he can dump intact +tables and rows with SELECT INTO OUTFILE. The database must not otherwise +be used with these options! A bigger number below means that all precautions +of lower numbers are included. 
*/ +enum { + SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it + detects a corrupt page */ + SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from + running: if a crash would occur + in purge, this prevents it */ + SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after + recovery */ + SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations: + if they would cause a crash, better + not do them */ + SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when + starting the database: InnoDB will + treat even incomplete transactions + as committed */ + SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward + in connection with recovery */ +}; + +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + +/*********************************************************************//** +Boots Innobase server. */ +void +srv_boot(void); +/*==========*/ +/*********************************************************************//** +Frees the data structures created in srv_init(). */ +void +srv_free(void); + +/** Wake up the purge if there is work to do. */ +void +srv_wake_purge_thread_if_not_active(); + +/******************************************************************//** +Outputs to a file the output of the InnoDB Monitor. 
+@return FALSE if not all information printed +due to failure to obtain necessary mutex */ +ibool +srv_printf_innodb_monitor( +/*======================*/ + FILE* file, /*!< in: output stream */ + ibool nowait, /*!< in: whether to wait for the + lock_sys_t::mutex */ + ulint* trx_start, /*!< out: file position of the start of + the list of active transactions */ + ulint* trx_end); /*!< out: file position of the end of + the list of active transactions */ + +/******************************************************************//** +Function to pass InnoDB status variables to MySQL */ +void +srv_export_innodb_status(void); +/*==========================*/ +/*******************************************************************//** +Get current server activity count. +@return activity count. */ +ulint +srv_get_activity_count(void); +/*========================*/ + +/******************************************************************//** +Increment the server activity counter. */ +void +srv_inc_activity_count(void); +/*=========================*/ + +/**********************************************************************//** +Enqueues a task to server task queue and releases a worker thread, if there +is a suspended one. */ +void +srv_que_task_enqueue_low( +/*=====================*/ + que_thr_t* thr); /*!< in: query thread */ + +/** +Flag which is set, whenever innodb_purge_threads changes. +It is read and reset in srv_do_purge(). + +Thus it is Atomic_counter<int>, not bool, since unprotected +reads are used. We just need an atomic with relaxed memory +order, to please Thread Sanitizer. +*/ +extern Atomic_counter<int> srv_purge_thread_count_changed; + +#ifdef UNIV_DEBUG +/** @return whether purge or master task is active */ +bool srv_any_background_activity(); +#endif + +extern "C" { + + +/** Periodic task which prints the info output by various InnoDB monitors.*/ +void srv_monitor_task(void*); + + +/** The periodic master task controlling the server. 
*/ +void srv_master_callback(void*); + + +/** +Complete the shutdown tasks such as background DROP TABLE, +and optionally change buffer merge (on innodb_fast_shutdown=0). */ +void srv_shutdown(bool ibuf_merge); + +} /* extern "C" */ + +#ifdef UNIV_DEBUG +/** @return number of tasks in queue */ +ulint srv_get_task_queue_length(); +#endif + +/** Shut down the purge threads. */ +void srv_purge_shutdown(); + +/** Initialize the purge tasks. */ +void srv_init_purge_tasks(); + +#ifdef UNIV_DEBUG +/** Disables master thread. It's used by: + SET GLOBAL innodb_master_thread_disabled_debug = 1 (0). +@param[in] save immediate result from check function */ +void +srv_master_thread_disabled_debug_update(THD*, st_mysql_sys_var*, void*, + const void* save); +#endif /* UNIV_DEBUG */ + +/** Status variables to be passed to MySQL */ +struct export_var_t{ + char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */ + char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */ + char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */ + my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */ + ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */ + ulint innodb_buffer_pool_pages_data; /*!< Data pages */ + ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ + ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */ + ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */ + ulint innodb_buffer_pool_pages_misc; /*!< Miscellaneous pages */ + ulint innodb_buffer_pool_pages_free; /*!< Free pages */ +#ifdef UNIV_DEBUG + ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */ +#endif /* UNIV_DEBUG */ + ulint innodb_buffer_pool_pages_made_not_young; + ulint innodb_buffer_pool_pages_made_young; + ulint innodb_buffer_pool_pages_old; + ulint innodb_buffer_pool_read_requests; /*!< buf_pool.stat.n_page_gets */ + ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */ + 
ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */ + ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */ + ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */ + ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/ + ulint innodb_checkpoint_age; + ulint innodb_checkpoint_max_age; + ulint innodb_data_pending_reads; /*!< Pending reads */ + ulint innodb_data_pending_writes; /*!< Pending writes */ + ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */ + ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */ + ulint innodb_data_read; /*!< Data bytes read */ + ulint innodb_data_writes; /*!< I/O write requests */ + ulint innodb_data_written; /*!< Data bytes written */ + ulint innodb_data_reads; /*!< I/O read requests */ + ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */ + ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */ + ulint innodb_deadlocks; + ulint innodb_history_list_length; + ulint innodb_log_waits; /*!< srv_log_waits */ + ulint innodb_log_write_requests; /*!< srv_log_write_requests */ + ulint innodb_log_writes; /*!< srv_log_writes */ + lsn_t innodb_lsn_current; + lsn_t innodb_lsn_flushed; + lsn_t innodb_lsn_last_checkpoint; + trx_id_t innodb_max_trx_id; +#ifdef BTR_CUR_HASH_ADAPT + ulint innodb_mem_adaptive_hash; +#endif + ulint innodb_mem_dictionary; + lsn_t innodb_os_log_written; /*!< srv_os_log_written */ + ulint innodb_os_log_fsyncs; /*!< n_log_flushes */ + ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */ + ulint innodb_os_log_pending_fsyncs; /*!< n_pending_log_flushes */ + ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ + ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */ + int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time + / 1000 */ + ulint innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time + / 1000 + / srv_n_lock_wait_count */ + ulint innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time + / 
1000 */ + ulint innodb_rows_read; /*!< srv_n_rows_read */ + ulint innodb_rows_inserted; /*!< srv_n_rows_inserted */ + ulint innodb_rows_updated; /*!< srv_n_rows_updated */ + ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */ + ulint innodb_system_rows_read; /*!< srv_n_system_rows_read */ + ulint innodb_system_rows_inserted; /*!< srv_n_system_rows_inserted */ + ulint innodb_system_rows_updated; /*!< srv_n_system_rows_updated */ + ulint innodb_system_rows_deleted; /*!< srv_n_system_rows_deleted*/ + ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */ + + /** Number of undo tablespace truncation operations */ + ulong innodb_undo_truncations; + ulint innodb_defragment_compression_failures; /*!< Number of + defragment re-compression + failures */ + + ulint innodb_defragment_failures; /*!< Number of defragment + failures*/ + ulint innodb_defragment_count; /*!< Number of defragment + operations*/ + + /** Number of instant ALTER TABLE operations that affect columns */ + ulong innodb_instant_alter_column; + + ulint innodb_onlineddl_rowlog_rows; /*!< Online alter rows */ + ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage + of used row log buffer */ + ulint innodb_onlineddl_pct_progress; /*!< Online alter progress */ + + int64_t innodb_page_compression_saved;/*!< Number of bytes saved + by page compression */ + int64_t innodb_index_pages_written; /*!< Number of index pages + written */ + int64_t innodb_non_index_pages_written; /*!< Number of non index pages + written */ + int64_t innodb_pages_page_compressed;/*!< Number of pages + compressed by page compression */ + int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations + induced by page compression */ + int64_t innodb_pages_page_decompressed;/*!< Number of pages + decompressed by page + compression */ + int64_t innodb_pages_page_compression_error;/*!< Number of page + compression errors */ + int64_t innodb_pages_encrypted; /*!< Number of pages + encrypted */ + 
int64_t innodb_pages_decrypted; /*!< Number of pages + decrypted */ + + /** Number of merge blocks encrypted */ + ib_int64_t innodb_n_merge_blocks_encrypted; + /** Number of merge blocks decrypted */ + ib_int64_t innodb_n_merge_blocks_decrypted; + /** Number of row log blocks encrypted */ + ib_int64_t innodb_n_rowlog_blocks_encrypted; + /** Number of row log blocks decrypted */ + ib_int64_t innodb_n_rowlog_blocks_decrypted; + + /** Number of temporary tablespace pages encrypted */ + ib_int64_t innodb_n_temp_blocks_encrypted; + + /** Number of temporary tablespace pages decrypted */ + ib_int64_t innodb_n_temp_blocks_decrypted; + + ulint innodb_sec_rec_cluster_reads; /*!< srv_sec_rec_cluster_reads */ + ulint innodb_sec_rec_cluster_reads_avoided;/*!< srv_sec_rec_cluster_reads_avoided */ + + ulint innodb_encryption_rotation_pages_read_from_cache; + ulint innodb_encryption_rotation_pages_read_from_disk; + ulint innodb_encryption_rotation_pages_modified; + ulint innodb_encryption_rotation_pages_flushed; + ulint innodb_encryption_rotation_estimated_iops; + int64_t innodb_encryption_key_requests; + int64_t innodb_key_rotation_list_length; +}; + +/** Thread slot in the thread table. */ +struct srv_slot_t{ + ibool in_use; /*!< TRUE if this slot + is in use */ + /** time(NULL) when the thread was suspended. + FIXME: Use my_interval_timer() or similar, to avoid bogus + timeouts in lock_wait_check_and_cancel() or lock_wait_suspend_thread() + when the system time is adjusted to the past! + + FIXME: This is duplicating trx_lock_t::wait_started, + which is being used for diagnostic purposes only. */ + time_t suspend_time; + ulong wait_timeout; /*!< wait time that if exceeded + the thread will be timed out. 
+ Initialized by + lock_wait_table_reserve_slot() + for lock wait */ + os_event_t event; /*!< event used in suspending + the thread when it has nothing + to do */ + que_thr_t* thr; /*!< suspended query thread + (only used for user threads) */ +}; + +extern tpool::thread_pool *srv_thread_pool; +extern std::unique_ptr<tpool::timer> srv_master_timer; +extern std::unique_ptr<tpool::timer> srv_monitor_timer; + +/** The interval at which srv_monitor_task is invoked, in milliseconds */ +constexpr unsigned SRV_MONITOR_INTERVAL= 15000; /* 4 times per minute */ + +static inline void srv_monitor_timer_schedule_now() +{ + srv_monitor_timer->set_time(0, SRV_MONITOR_INTERVAL); +} +static inline void srv_start_periodic_timer(std::unique_ptr<tpool::timer>& t, + void (*func)(void*), int period) +{ + t.reset(srv_thread_pool->create_timer(func)); + t->set_time(0, period); +} + +void srv_thread_pool_init(); +void srv_thread_pool_end(); diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h new file mode 100644 index 00000000..324e3f04 --- /dev/null +++ b/storage/innobase/include/srv0start.h @@ -0,0 +1,129 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0start.h +Starts the Innobase database server + +Created 10/10/1995 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "log0log.h" +#include "ut0byte.h" + +// Forward declaration +struct dict_table_t; + +/** Open the configured number of dedicated undo tablespaces. +@param[in] create_new_db whether the database is being initialized +@return DB_SUCCESS or error code */ +dberr_t +srv_undo_tablespaces_init(bool create_new_db); + +/** Start InnoDB. +@param[in] create_new_db whether to create a new database +@return DB_SUCCESS or error code */ +dberr_t srv_start(bool create_new_db); + +/** + Shutdown purge to make sure that there is no possibility that we call any + plugin code (e.g., audit) inside virtual column computation. +*/ +void innodb_preshutdown(); + +/** Shut down InnoDB. */ +void innodb_shutdown(); + +/** Shut down background threads that can generate undo log. */ +void srv_shutdown_bg_undo_sources(); + +/*************************************************************//** +Copy the file path component of the physical file to parameter. It will +copy up to and including the terminating path separator. +@return number of bytes copied or ULINT_UNDEFINED if destination buffer + is smaller than the path to be copied. 
*/ +ulint +srv_path_copy( +/*==========*/ + char* dest, /*!< out: destination buffer */ + ulint dest_len, /*!< in: max bytes to copy */ + const char* basedir, /*!< in: base directory */ + const char* table_name) /*!< in: source table name */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Get the meta-data filename from the table name for a +single-table tablespace. +@param[in] table table object +@param[out] filename filename +@param[in] max_len filename max length */ +void +srv_get_meta_data_filename( + dict_table_t* table, + char* filename, + ulint max_len); + +/** Get the encryption-data filename from the table name for a +single-table tablespace. +@param[in] table table object +@param[out] filename filename +@param[in] max_len filename max length */ +void +srv_get_encryption_data_filename( + dict_table_t* table, + char* filename, + ulint max_len); + +/** Log sequence number at shutdown */ +extern lsn_t srv_shutdown_lsn; + +/** TRUE if the server is being started */ +extern bool srv_is_being_started; +/** TRUE if SYS_TABLESPACES is available for lookups */ +extern bool srv_sys_tablespaces_open; +/** TRUE if the server is being started, before rolling back any +incomplete transactions */ +extern bool srv_startup_is_before_trx_rollback_phase; + +/** TRUE if a raw partition is in use */ +extern ibool srv_start_raw_disk_in_use; + +/** Shutdown state */ +enum srv_shutdown_t { + SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */ + /** Shutdown initiated in srv_shutdown_bg_undo_sources() */ + SRV_SHUTDOWN_INITIATED, + SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in + logs_empty_and_mark_files_at_shutdown() */ + SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that + the buffer pool can be freed: flush + all file spaces and close all files */ + SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */ +}; + +/** Whether any undo log records can be generated */ +extern bool srv_undo_sources; + +/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to 
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ +extern enum srv_shutdown_t srv_shutdown_state; + +/** Files comprising the system tablespace */ +extern pfs_os_file_t files[1000]; diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h new file mode 100644 index 00000000..f9f923f9 --- /dev/null +++ b/storage/innobase/include/sync0arr.h @@ -0,0 +1,129 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0arr.h +The wait array used in synchronization primitives + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0arr_h +#define sync0arr_h + +#include "univ.i" + +/** Synchronization wait array cell */ +struct sync_cell_t; + +/** Synchronization wait array */ +struct sync_array_t; + +/******************************************************************//** +Get an instance of the sync wait array and reserve a wait array cell +in the instance for waiting for an object. 
The event of the cell is +reset to nonsignalled state. +If reserving cell of the instance fails, try to get another new +instance until we can reserve an empty cell of it. +@return the sync array found, never NULL. */ +UNIV_INLINE +sync_array_t* +sync_array_get_and_reserve_cell( + void* object, /*!< in: pointer to the object to wait for */ + ulint type, /*!< in: lock request type */ + const char* file, /*!< in: file where requested */ + unsigned line, /*!< in: line where requested */ + sync_cell_t** cell); /*!< out: the cell reserved, never NULL */ +/******************************************************************//** +Reserves a wait array cell for waiting for an object. +The event of the cell is reset to nonsignalled state. */ +sync_cell_t* +sync_array_reserve_cell( + sync_array_t* arr, /*!< in: wait array */ + void* object, /*!< in: pointer to the object to wait for */ + ulint type, /*!< in: lock request type */ + const char* file, /*!< in: file where requested */ + unsigned line); /*!< in: line where requested */ + +/******************************************************************//** +This function should be called when a thread starts to wait on +a wait array cell. In the debug version this function checks +if the wait for a semaphore will result in a deadlock, in which +case prints info and asserts. */ +void +sync_array_wait_event( + sync_array_t* arr, /*!< in: wait array */ + sync_cell_t*& cell); /*!< in: the reserved cell */ + +/******************************************************************//** +Frees the cell. NOTE! sync_array_wait_event frees the cell +automatically! 
*/ +void +sync_array_free_cell( + sync_array_t* arr, /*!< in: wait array */ + sync_cell_t*& cell); /*!< in: the reserved cell */ + +/** count of how many times an object has been signalled */ +extern ulint sg_count; +#define sync_array_object_signalled() ++sg_count + +/**********************************************************************//** +Prints warnings of long semaphore waits to stderr. +@return TRUE if fatal semaphore wait threshold was exceeded */ +ibool +sync_array_print_long_waits( + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema); /*!< out: longest-waited-for semaphore */ + +/**********************************************************************//** +Prints info of the wait array. */ +void +sync_array_print( + FILE* file); /*!< in: file where to print */ + +/** Create the primary system wait arrays */ +void sync_array_init(); + +/** Destroy the sync array wait sub-system. */ +void sync_array_close(); + +/**********************************************************************//** +Get an instance of the sync wait array. */ +UNIV_INLINE +sync_array_t* +sync_array_get(); +/**********************************************************************//** +Prints info of the wait array without using any mutexes/semaphores. */ +UNIV_INTERN +void +sync_array_print_innodb(void); + +/*****************************************************************//** +Gets the nth cell in array. 
+@return cell */ +UNIV_INTERN +sync_cell_t* +sync_array_get_nth_cell( +/*====================*/ + sync_array_t* arr, /*!< in: sync array */ + ulint n); /*!< in: index */ + +#include "sync0arr.ic" + +#endif /* sync0arr_h */ diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic new file mode 100644 index 00000000..962226b4 --- /dev/null +++ b/storage/innobase/include/sync0arr.ic @@ -0,0 +1,85 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0arr.ic +The wait array for synchronization primitives + +Inline code + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +extern ulint sync_array_size; +extern sync_array_t** sync_wait_array; + +#include "ut0counter.h" + +/**********************************************************************//** +Get an instance of the sync wait array. +@return an instance of the sync wait array. 
*/ + +UNIV_INLINE +sync_array_t* +sync_array_get() +/*============*/ +{ + if (sync_array_size <= 1) { + return(sync_wait_array[0]); + } + + return(sync_wait_array[get_rnd_value() % sync_array_size]); +} + +/******************************************************************//** +Get an instance of the sync wait array and reserve a wait array cell +in the instance for waiting for an object. The event of the cell is +reset to nonsignalled state. +If reserving a cell in the instance fails, try another instance, +making at most sync_array_size attempts in total. +@return the sync array reserved, never NULL. */ +UNIV_INLINE +sync_array_t* +sync_array_get_and_reserve_cell( +/*============================*/ + void* object, /*!< in: pointer to the object to wait for */ + ulint type, /*!< in: lock request type */ + const char* file, /*!< in: file where requested */ + unsigned line, /*!< in: line where requested */ + sync_cell_t** cell) /*!< out: the cell reserved, never NULL */ +{ + sync_array_t* sync_arr = NULL; + + *cell = NULL; + for (ulint i = 0; i < sync_array_size && *cell == NULL; ++i) { + /* The sync array is currently picked at random, so we + try at most sync_array_size times, in case any of the + arrays we get is full */ + sync_arr = sync_array_get(); + *cell = sync_array_reserve_cell(sync_arr, object, type, + file, line); + } + + /* The loop above gives up after sync_array_size attempts, + so it can end without having reserved a cell. + Assert that a cell was reserved before returning it. */ + ut_a(*cell != NULL); + + return(sync_arr); +} diff --git a/storage/innobase/include/sync0debug.h b/storage/innobase/include/sync0debug.h new file mode 100644 index 00000000..07e98546 --- /dev/null +++ b/storage/innobase/include/sync0debug.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2017, 2020, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0debug.h +Debug checks for latches, header file + +Created 2012-08-21 Sunny Bains +*******************************************************/ + +#ifndef sync0debug_h +#define sync0debug_h + +#include "univ.i" + +/** Initializes the synchronization data structures. */ +void +sync_check_init(); + +/** Free the InnoDB synchronization data structures. */ +void +sync_check_close(); + +#ifdef UNIV_DEBUG +/** Check if it is OK to acquire the latch. +Only declared in UNIV_DEBUG builds. +@param[in] latch latch that is about to be acquired */ +void +sync_check_lock_validate(const latch_t* latch); + +/** Note that the lock has been granted. +Only declared in UNIV_DEBUG builds. +@param[in] latch latch that was granted */ +void +sync_check_lock_granted(const latch_t* latch); + +/** Check if it is OK to acquire the latch. 
+@param[in] latch latch type +@param[in] level the level of the mutex */ +void +sync_check_lock(const latch_t* latch, latch_level_t level); + +/** +Check if it is OK to re-acquire the lock. +@param[in] latch latch to check */ +void +sync_check_relock(const latch_t* latch); + +/** Removes a latch from the thread level array if it is found there. +@param[in] latch latch to unlock */ +void +sync_check_unlock(const latch_t* latch); + +/** Checks if the level array for the current thread contains a +mutex or rw-latch at the specified level. +@param[in] level level to find +@return a matching latch, or NULL if not found */ +const latch_t* +sync_check_find(latch_level_t level); + +/** Calls the functor for each element of the level array for the current +thread, e.g. to check that the array is empty. +Iteration terminates as soon as the functor returns true. +@param[in] functor called for each element. +@return true if the functor returns true for any element */ +bool +sync_check_iterate(const sync_check_functor_t& functor); + +/** Acquires the debug mutex. We cannot use the mutex defined in sync0sync, +because the debug mutex is also acquired in sync0arr while holding the OS +mutex protecting the sync array, and the ordinary mutex_enter might +recursively call routines in sync0arr, leading to a deadlock on the OS +mutex. */ +void +rw_lock_debug_mutex_enter(); + +/** Releases the debug mutex. */ +void +rw_lock_debug_mutex_exit(); + +#endif /* UNIV_DEBUG */ + +#endif /* !sync0debug_h */ diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h new file mode 100644 index 00000000..68397827 --- /dev/null +++ b/storage/innobase/include/sync0policy.h @@ -0,0 +1,296 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2018, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/sync0policy.h +Policies for mutexes. + +Created 2012-08-21 Sunny Bains. +***********************************************************************/ + +#ifndef sync0policy_h +#define sync0policy_h + +#include "ut0rnd.h" +#include "os0thread.h" +#include "srv0mon.h" +#include "sync0debug.h" + +#ifdef UNIV_DEBUG + +template <typename Mutex> class MutexDebug: public latch_t +{ + /** Mutex to check for lock order violation; NULL while not locked */ + const Mutex *m_mutex; + /** Filename from where enter was called */ + const char *m_filename; + /** Line number in filename */ + unsigned m_line; + /** Thread ID of the thread that owns the mutex */ + os_thread_id_t m_thread_id; + /** Mutex protecting the above members */ + mutable OSMutex m_debug_mutex; + + /** Replace all four owner fields while holding m_debug_mutex. */ + void set(const Mutex *mutex, const char *filename, unsigned line, + os_thread_id_t thread_id) + { + m_debug_mutex.enter(); + m_mutex= mutex; + m_filename= filename; + m_line= line; + m_thread_id= thread_id; + m_debug_mutex.exit(); + } + + /** @return a copy of the four owner fields, read while holding + m_debug_mutex. The m_debug_mutex of the returned copy itself is not + init()ed here. */ + const MutexDebug get() const + { + MutexDebug ret; + m_debug_mutex.enter(); + ret.m_mutex= m_mutex; + ret.m_filename= m_filename; + ret.m_line= m_line; + ret.m_thread_id= m_thread_id; + 
m_debug_mutex.exit(); + return ret; + } + + + /** + Debug-asserts that no owner is recorded: all four owner fields must + hold the values that set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)) + stores. + Called either when mutex is locked or destroyed. Thus members are protected + from concurrent modification. + */ + void assert_clean_context() + { + ut_ad(!m_mutex); + ut_ad(!m_filename); + ut_ad(!m_line); + ut_ad(m_thread_id == os_thread_id_t(ULINT_UNDEFINED)); + } + + +public: + /** + Called when the mutex is "created". Note: Not from the constructor + but when the mutex is initialised. + Stores the latch id, initialises m_debug_mutex and records that the + mutex has no owner. + @param[in] id Mutex ID, must not be LATCH_ID_NONE + */ + void init(latch_id_t id) + { + ut_ad(id != LATCH_ID_NONE); + m_id= id; + m_debug_mutex.init(); + set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + } + + + /** Mutex is being destroyed. Asserts that no owner is recorded and + then destroys m_debug_mutex. */ + void destroy() + { + assert_clean_context(); + m_debug_mutex.destroy(); + } + + + /** + Called when an attempt is made to lock the mutex. + Asserts that the calling thread does not already own the mutex, then + validates the latch order using a temporary context that describes the + pending request; the owner fields of this object are left untouched. + @param[in] mutex Mutex instance to be locked + @param[in] filename Filename from where it was called + @param[in] line Line number from where it was called + */ + void enter(const Mutex &mutex, const char *filename, unsigned line) + { + MutexDebug context; + ut_ad(!is_owned()); + context.init(m_id); + context.set(&mutex, filename, line, os_thread_get_curr_id()); + /* Check for latch order violation. 
*/ + sync_check_lock_validate(&context); + context.set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + context.destroy(); + } + + + /** + Called when the mutex is locked. + Asserts that no owner was recorded, records the calling thread as the + owner and reports the grant to the latch order checker. + @param[in] mutex Mutex instance that was locked + @param[in] filename Filename from where it was called + @param[in] line Line number from where it was called + */ + void locked(const Mutex &mutex, const char *filename, unsigned line) + { + assert_clean_context(); + set(&mutex, filename, line, os_thread_get_curr_id()); + sync_check_lock_granted(this); + } + + + /** + Called when the mutex is released. + Asserts that the calling thread is the owner, clears the recorded owner + and reports the unlock to the latch order checker. + @param[in] mutex Mutex that was released (not used here) + */ + void release(const Mutex &mutex) + { + ut_ad(is_owned()); + set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + sync_check_unlock(this); + } + + + /** @return true if the calling thread owns the mutex */ + bool is_owned() const + { + return os_thread_eq(get_thread_id(), os_thread_get_curr_id()); + } + + + /** @return the name of the file from where the mutex was acquired */ + const char* get_enter_filename() const { return get().m_filename; } + + + /** @return the line number from where the mutex was acquired */ + unsigned get_enter_line() const { return get().m_line; } + + + /** @return id of the thread that owns the mutex, or + os_thread_id_t(ULINT_UNDEFINED) if no owner is recorded */ + os_thread_id_t get_thread_id() const { return get().m_thread_id; } + + + /** + Print information about the latch. + All owner fields, including the mutex pointer, are taken from one + snapshot made by get(), so they are read under m_debug_mutex and the + mutex pointer is only dereferenced when it is not NULL. It is NULL + whenever no owner is recorded (see init() and release()). + @return the string representation; "Not locked" if no owner is recorded + */ + virtual std::string to_string() const + { + std::ostringstream msg; + const MutexDebug ctx= get(); + + /* Read the snapshot fields directly: get() never init()s the + snapshot's own m_debug_mutex, so the locking getters are not + used on ctx. */ + if (ctx.m_mutex) + msg << ctx.m_mutex->policy().to_string() + << " addr: " << ctx.m_mutex << " acquired: " + << sync_basename(ctx.m_filename) << ":" + << ctx.m_line; + else + msg << "Not locked"; + + return(msg.str()); + } +}; +#endif /* UNIV_DEBUG */ + +/** Collect the metrics per mutex instance, no aggregation. */ +template <typename Mutex> +struct GenericPolicy +{ +public: + /** Called when the mutex is "created". 
Note: Not from the constructor + but when the mutex is initialised. + @param[in] id Mutex ID + @param[in] filename File where mutex was created + @param[in] line Line in filename */ + void init( + const Mutex&, + latch_id_t id, + const char* filename, + uint32_t line) + UNIV_NOTHROW + { + m_id = id; + + latch_meta_t& meta = sync_latch_get_meta(id); + + ut_ad(meta.get_id() == id); + + meta.get_counter()->single_register(&m_count); + + m_filename = filename; + m_line = line; + } + + /** Called when the mutex is destroyed. Deregisters m_count from the + counter of the latch meta-data looked up by m_id. */ + void destroy() + UNIV_NOTHROW + { + latch_meta_t& meta = sync_latch_get_meta(m_id); + + meta.get_counter()->single_deregister(&m_count); + } + + /** Called after a successful mutex acquire. Does nothing unless + m_count.m_enabled is set; otherwise adds the arguments to the spin and + wait totals and counts one call. + @param[in] n_spins Number of times the thread did + spins while trying to acquire the mutex + @param[in] n_waits Number of times the thread waited + in some type of OS queue */ + void add( + uint32_t n_spins, + uint32_t n_waits) + UNIV_NOTHROW + { + /* Currently global on/off. Keeps things simple and fast */ + + if (!m_count.m_enabled) { + + return; + } + + m_count.m_spins += n_spins; + m_count.m_waits += n_waits; + + ++m_count.m_calls; + } + + /** Print the information about the latch + @return the string representation */ + std::string print() const + UNIV_NOTHROW; + + /** @return the latch ID */ + latch_id_t get_id() const + UNIV_NOTHROW + { + return(m_id); + } + + + /** @return the string representation, built by sync_mutex_to_string() + from the latch ID and the "file:line" stored by init() */ + std::string to_string() const + { + return sync_mutex_to_string(get_id(), + std::string(m_filename) + .append(":") + .append(std::to_string(m_line))); + } + +#ifdef UNIV_DEBUG + MutexDebug<Mutex> context; +#endif + +private: + const char *m_filename; /*!< file where the mutex was created, as passed to init() */ + uint32_t m_line; /*!< line in m_filename, as passed to init() */ + + /** The user visible counters, registered with the meta-data. 
*/ + latch_meta_t::CounterType::Count m_count; + + /** Latch meta data ID */ + latch_id_t m_id; +}; + +#endif /* sync0policy_h */ diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h new file mode 100644 index 00000000..084acc51 --- /dev/null +++ b/storage/innobase/include/sync0rw.h @@ -0,0 +1,838 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2017, 2020, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0rw.h +The read-write lock (for threads, not for database transactions) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0rw_h +#define sync0rw_h + +#include "os0event.h" +#include "ut0mutex.h" +#include "ilist.h" + +/** Counters for RW locks: one group of three (spin waits, spin loop +rounds, OS waits) for each of the shared, exclusive and sx lock modes. */ +struct rw_lock_stats_t { + typedef ib_counter_t<int64_t, IB_N_SLOTS> int64_counter_t; + + /** number of spin waits on rw-latches + during shared (read) lock requests */ + int64_counter_t rw_s_spin_wait_count; + + /** number of spin loop rounds on rw-latches + during shared (read) lock requests */ + int64_counter_t rw_s_spin_round_count; + + /** number of OS waits on rw-latches + during shared (read) lock requests */ + int64_counter_t rw_s_os_wait_count; + + /** number of spin waits on rw-latches + during exclusive (write) lock requests */ + int64_counter_t rw_x_spin_wait_count; + + /** number of spin loop rounds on rw-latches + during exclusive (write) lock requests */ + int64_counter_t rw_x_spin_round_count; + + /** number of OS waits on rw-latches + during exclusive (write) lock requests */ + int64_counter_t rw_x_os_wait_count; + + /** number of spin waits on rw-latches + during sx lock requests */ + int64_counter_t rw_sx_spin_wait_count; + + /** number of spin loop rounds on rw-latches + during sx lock requests */ + int64_counter_t rw_sx_spin_round_count; + + /** number of OS waits on rw-latches + during sx lock requests */ + int64_counter_t rw_sx_os_wait_count; +}; + +/* Latch types; these are used also in btr0btr.h and mtr0mtr.h: keep the +numerical values smaller than 30 (smaller 
than BTR_MODIFY_TREE and +MTR_MEMO_MODIFY) and the order of the numerical values like below! and they +should be 2pow value to be used also as ORed combination of flag. */ +enum rw_lock_type_t { + RW_S_LATCH = 1, + RW_X_LATCH = 2, + RW_SX_LATCH = 4, + RW_NO_LATCH = 8 +}; + +/* We decrement lock_word by X_LOCK_DECR for each x_lock. It is also the +start value for the lock_word, meaning that it limits the maximum number +of concurrent read locks before the rw_lock breaks. */ +/* We decrement lock_word by X_LOCK_HALF_DECR for sx_lock. +X_LOCK_HALF_DECR is exactly half of X_LOCK_DECR. */ +#define X_LOCK_DECR 0x20000000 +#define X_LOCK_HALF_DECR 0x10000000 + +#ifdef rw_lock_t +#undef rw_lock_t +#endif +struct rw_lock_t; + +#ifdef UNIV_DEBUG +struct rw_lock_debug_t; +#endif /* UNIV_DEBUG */ + +extern ilist<rw_lock_t> rw_lock_list; +extern ib_mutex_t rw_lock_list_mutex; + +/** Counters for RW locks. */ +extern rw_lock_stats_t rw_lock_stats; + +#ifndef UNIV_PFS_RWLOCK +/******************************************************************//** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. +If the MySQL performance schema is enabled and "UNIV_PFS_RWLOCK" is +defined, the rw-locks are instrumented with performance schema probes. +In this branch (UNIV_PFS_RWLOCK not defined) the key argument K of +rw_lock_create() is ignored, and the level argument is only passed on +in UNIV_DEBUG builds. */ +# ifdef UNIV_DEBUG +# define rw_lock_create(K, L, level) \ + rw_lock_create_func((L), (level), __FILE__, __LINE__) +# else /* UNIV_DEBUG */ +# define rw_lock_create(K, L, level) \ + rw_lock_create_func((L), __FILE__, __LINE__) +# endif /* UNIV_DEBUG */ + +/**************************************************************//** +NOTE! The following macros should be used in rw locking and +unlocking, not the corresponding function. 
*/ + +# define rw_lock_s_lock(M) \ + rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_s_lock_inline(M, P, F, L) \ + rw_lock_s_lock_func((M), (P), (F), (L)) + +# define rw_lock_s_lock_gen(M, P) \ + rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_nowait(M, F, L) \ + rw_lock_s_lock_low((M), 0, (F), (L)) + +# ifdef UNIV_DEBUG +# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L) +# else +# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L) +# endif /* UNIV_DEBUG */ + +#define rw_lock_sx_lock(L) \ + rw_lock_sx_lock_func((L), 0, __FILE__, __LINE__) + +#define rw_lock_sx_lock_inline(M, P, F, L) \ + rw_lock_sx_lock_func((M), (P), (F), (L)) + +#define rw_lock_sx_lock_gen(M, P) \ + rw_lock_sx_lock_func((M), (P), __FILE__, __LINE__) + +#define rw_lock_sx_lock_nowait(M, P) \ + rw_lock_sx_lock_low((M), (P), __FILE__, __LINE__) +/* NOTE(review): the four rw_lock_sx_lock* definitions that follow repeat the four directly above token for token; an identical redefinition is accepted by the preprocessor, so the repetition is redundant but harmless. */ +#define rw_lock_sx_lock(L) \ + rw_lock_sx_lock_func((L), 0, __FILE__, __LINE__) + +#define rw_lock_sx_lock_inline(M, P, F, L) \ + rw_lock_sx_lock_func((M), (P), (F), (L)) + +#define rw_lock_sx_lock_gen(M, P) \ + rw_lock_sx_lock_func((M), (P), __FILE__, __LINE__) + +#define rw_lock_sx_lock_nowait(M, P) \ + rw_lock_sx_lock_low((M), (P), __FILE__, __LINE__) + +# ifdef UNIV_DEBUG +# define rw_lock_sx_unlock(L) rw_lock_sx_unlock_func(0, L) +# define rw_lock_sx_unlock_gen(L, P) rw_lock_sx_unlock_func(P, L) +# else /* UNIV_DEBUG */ +# define rw_lock_sx_unlock(L) rw_lock_sx_unlock_func(L) +# define rw_lock_sx_unlock_gen(L, P) rw_lock_sx_unlock_func(L) +# endif /* UNIV_DEBUG */ + +# define rw_lock_x_lock(M) \ + rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_x_lock_inline(M, P, F, L) \ + rw_lock_x_lock_func((M), (P), (F), (L)) + +# define rw_lock_x_lock_gen(M, P) \ + rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_x_lock_nowait(M) \ + rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) + +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + 
rw_lock_x_lock_func_nowait((M), (F), (L)) + +# ifdef UNIV_DEBUG +# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L) +# else +# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L) +# endif + +# define rw_lock_free(M) rw_lock_free_func(M) + +#else /* !UNIV_PFS_RWLOCK */ + +/* The following macros map the same rw_lock_*() names to the +Performance Schema instrumented pfs_rw_lock_*() functions. Unlike the +non-instrumented rw_lock_create(), this one passes the key K on. */ +# ifdef UNIV_DEBUG +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), (level), __FILE__, __LINE__) +# else /* UNIV_DEBUG */ +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), __FILE__, __LINE__) +# endif /* UNIV_DEBUG */ + +/****************************************************************** +NOTE! The following macros should be used in rw locking and +unlocking, not the corresponding functions. */ + +# define rw_lock_s_lock(M) \ + pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_s_lock_inline(M, P, F, L) \ + pfs_rw_lock_s_lock_func((M), (P), (F), (L)) + +# define rw_lock_s_lock_gen(M, P) \ + pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_nowait(M, F, L) \ + pfs_rw_lock_s_lock_low((M), 0, (F), (L)) + +# ifdef UNIV_DEBUG +# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(P, L) +# else +# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(L) +# endif + +# define rw_lock_sx_lock(M) \ + pfs_rw_lock_sx_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_sx_lock_inline(M, P, F, L) \ + pfs_rw_lock_sx_lock_func((M), (P), (F), (L)) + +# define rw_lock_sx_lock_gen(M, P) \ + pfs_rw_lock_sx_lock_func((M), (P), __FILE__, __LINE__) + +#define rw_lock_sx_lock_nowait(M, P) \ + pfs_rw_lock_sx_lock_low((M), (P), __FILE__, __LINE__) + +# ifdef UNIV_DEBUG +# define rw_lock_sx_unlock(L) pfs_rw_lock_sx_unlock_func(0, L) +# define rw_lock_sx_unlock_gen(L, P) pfs_rw_lock_sx_unlock_func(P, L) +# else +# define rw_lock_sx_unlock(L) pfs_rw_lock_sx_unlock_func(L) +# define rw_lock_sx_unlock_gen(L, P) 
pfs_rw_lock_sx_unlock_func(L) +# endif + +# define rw_lock_x_lock(M) \ + pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_x_lock_inline(M, P, F, L) \ + pfs_rw_lock_x_lock_func((M), (P), (F), (L)) + +# define rw_lock_x_lock_gen(M, P) \ + pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_x_lock_nowait(M) \ + pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) + +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + pfs_rw_lock_x_lock_func_nowait((M), (F), (L)) + +# ifdef UNIV_DEBUG +# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(P, L) +# else +# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(L) +# endif + +# define rw_lock_free(M) pfs_rw_lock_free_func(M) + +#endif /* !UNIV_PFS_RWLOCK */ +/* Shorthands for both branches above: unlock with a pass value of 0. */ +#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0) +#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0) + +/******************************************************************//** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. +Normally invoked through the rw_lock_create() macro, which supplies +__FILE__ and __LINE__ for the last two parameters. */ +void +rw_lock_create_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG + latch_level_t level, /*!< in: level (parameter exists only + in UNIV_DEBUG builds) */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + unsigned cline); /*!< in: file line where created */ +/******************************************************************//** +Calling this function is obligatory only if the memory buffer containing +the rw-lock is freed. Removes an rw-lock object from the global list. The +rw-lock is checked to be in the non-locked state. 
*/ +void +rw_lock_free_func( +/*==============*/ + rw_lock_t* lock); /*!< in/out: rw-lock */ +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks that the rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. +Only declared in UNIV_DEBUG builds. +@return true */ +bool +rw_lock_validate( +/*=============*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +#endif /* UNIV_DEBUG */ +/******************************************************************//** +Low-level function which tries to lock an rw-lock in s-mode. +When UNIV_PFS_RWLOCK is not defined, the rw_lock_s_lock_nowait() macro +expands to a call of this function with pass == 0. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass MY_ATTRIBUTE((unused)), + /*!< in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function, except if +you supply the file name and line number. Lock an rw-lock in shared mode +for the current thread. If the rw-lock is locked in exclusive mode, or +there is an exclusive lock request waiting, the function spins a preset +time (controlled by srv_n_spin_wait_rounds), waiting for the lock, before +suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. 
+@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_x_lock_func_nowait( +/*=======================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Releases a shared mode lock. +The pass parameter exists only in UNIV_DEBUG builds; the +rw_lock_s_unlock() and rw_lock_s_unlock_gen() macros hide that +difference from callers. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock); /*!< in/out: rw-lock */ + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread. If the rw-lock is locked +in shared or exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting +for the lock, before suspending the thread. If the same thread has an x-lock +on the rw-lock, locking succeeds, with the following exception: if pass != 0, +only a single x-lock may be taken on the lock. NOTE: If the same thread has +an s-lock, locking does not succeed! */ +void +rw_lock_x_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Low-level function for acquiring an sx lock. +@return FALSE if did not succeed, TRUE if success. 
*/ +ibool +rw_lock_sx_lock_low( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in SX mode for the current thread. If the rw-lock is locked +in exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock, before suspending the thread. If the same thread has an x-lock +on the rw-lock, locking succeeds, with the following exception: if pass != 0, +only a single sx-lock may be taken on the lock. NOTE: If the same thread has +an s-lock, locking does not succeed! +NOTE(review): the comments on rw_lock_s_lock_func() and +rw_lock_x_lock_func() name srv_n_spin_wait_rounds as the spin control; +SYNC_SPIN_ROUNDS here is presumably the same setting -- confirm. */ +void +rw_lock_sx_lock_func( +/*=================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Releases an exclusive mode lock. +The pass parameter exists only in UNIV_DEBUG builds; the +rw_lock_x_unlock() and rw_lock_x_unlock_gen() macros hide that +difference from callers. */ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock); /*!< in/out: rw-lock */ + +/******************************************************************//** +Releases an sx mode lock. 
*/ +UNIV_INLINE +void +rw_lock_sx_unlock_func( +/*===================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock); /*!< in/out: rw-lock */ + +/******************************************************************//** +This function is used in the insert buffer to move the ownership of an +x-latch on a buffer frame to the current thread. The x-latch was set by +the buffer read operation and it protected the buffer frame while the +read was done. The ownership is moved because we want the current +thread to be able to acquire a second x-latch which is stored in an mtr. +This, in turn, is needed to pass the debug checks of index page +operations. */ +void +rw_lock_x_lock_move_ownership( +/*==========================*/ + rw_lock_t* lock); /*!< in: lock which was x-locked in the + buffer read */ +/******************************************************************//** +Returns the value of writer_count for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. +NOTE(review): rw_lock_t as declared in this header has no member named +writer_count; the wording is presumably historical and the value is the +number of x-locks held -- confirm against sync0rw.ic. +@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the number of sx-locks for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. +@return number of sx-locks */ +UNIV_INLINE +ulint +rw_lock_get_sx_lock_count( +/*======================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the write-status of the lock - this function made more sense +with the old rw_lock implementation. 
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_X, RW_LOCK_X_WAIT, RW_LOCK_SX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the number of readers (s-locks). +@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Decrements lock_word the specified amount if it is greater than 0. +This is used by both s_lock and x_lock operations. +NOTE(review): the text above says "greater than 0" although the function +takes a threshold parameter; presumably lock_word is compared against +threshold -- confirm against sync0rw.ic. +@return true if the decrement was performed */ +UNIV_INLINE +bool +rw_lock_lock_word_decr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + int32_t amount, /*!< in: amount to decrement */ + int32_t threshold); /*!< in: threshold of judgement */ +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks if the thread has locked the rw-lock in the specified mode, with +the pass value == 0. +Only declared in UNIV_DEBUG builds; the result must not be ignored +(warn_unused_result). */ +bool +rw_lock_own( +/*========*/ + const rw_lock_t*lock, /*!< in: rw-lock */ + ulint lock_type) /*!< in: lock type: RW_LOCK_S, + RW_LOCK_X */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************************//** +Checks if the thread has locked the rw-lock in the specified mode, with +the pass value == 0. +Same as rw_lock_own(), except that the lock types are given as an OR of +flag values. Only declared in UNIV_DEBUG builds; the result must not be +ignored (warn_unused_result). */ +bool +rw_lock_own_flagged( +/*================*/ + const rw_lock_t* lock, /*!< in: rw-lock */ + rw_lock_flags_t flags) /*!< in: specify lock types with + OR of the rw_lock_flag_t values */ + MY_ATTRIBUTE((warn_unused_result)); +#endif /* UNIV_DEBUG */ +/******************************************************************//** +Checks if somebody has locked the rw-lock in the specified mode. 
+@return true if locked */ +bool +rw_lock_is_locked( +/*==============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type); /*!< in: lock type: RW_LOCK_S, + RW_LOCK_X or RW_LOCK_SX */ +#ifdef UNIV_DEBUG +/***************************************************************//** +Prints debug info of currently locked rw-locks. */ +void +rw_lock_list_print_info( +/*====================*/ + FILE* file); /*!< in: file where to print */ + +/*#####################################################################*/ + +/*********************************************************************//** +Prints info of a debug struct. */ +void +rw_lock_debug_print( +/*================*/ + FILE* f, /*!< in: output stream */ + const rw_lock_debug_t* info); /*!< in: debug struct */ +#endif /* UNIV_DEBUG */ + +/* NOTE! The structure appears here only for the compiler to know its size. +Do not use its fields directly! */ + +/** The structure used in the spin lock implementation of a read-write +lock. Several threads may have a shared lock simultaneously in this +lock, but only one writer may have an exclusive lock, in which case no +shared locks are allowed. To prevent starvation of a writer blocked by +readers, a writer may queue for x-lock by decrementing lock_word: no +new readers will be let in while the thread waits for readers to +exit. */ + +struct rw_lock_t : +#ifdef UNIV_DEBUG + public latch_t, +#endif /* UNIV_DEBUG */ + public ilist_node<> +{ + ut_d(bool created= false;) /* NOTE(review): presumably a debug-build flag set once the lock has been created -- confirm */ + + /** Holds the state of the lock. Per the comments on X_LOCK_DECR and + X_LOCK_HALF_DECR, the start value is X_LOCK_DECR, each x-lock + decrements it by X_LOCK_DECR and an sx-lock by X_LOCK_HALF_DECR. */ + Atomic_relaxed<int32_t> lock_word; + + /** 0=no waiters, 1=waiters for X or SX lock exist */ + Atomic_relaxed<uint32_t> waiters; + + /** number of granted SX locks. */ + volatile ulint sx_recursive; + + /** The value is typically set to thread id of a writer thread making + normal rw_locks recursive. In case of asynchronous IO, when a non-zero + value of 'pass' is passed then we keep the lock non-recursive. 
+ + writer_thread must be reset in x_unlock functions before incrementing + the lock_word. */ + volatile os_thread_id_t writer_thread; + + /** Used by sync0arr.cc for thread queueing */ + os_event_t event; + + /** Event for next-writer to wait on. A thread must decrement + lock_word before waiting. */ + os_event_t wait_ex_event; + + /** File name where lock created */ + const char* cfile_name; + + /** File name where last x-locked */ + const char* last_x_file_name; + + /** Line where created */ + unsigned cline:13; + + /** If 1 then the rw-lock is a block lock */ + unsigned is_block_lock:1; + + /** Line number where last time x-locked */ + unsigned last_x_line:14; + + /** Count of os_waits. May not be accurate */ + uint32_t count_os_wait; + +#ifdef UNIV_PFS_RWLOCK + /** The instrumentation hook */ + struct PSI_rwlock* pfs_psi; +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_DEBUG + std::string to_string() const override; + + /** In the debug version: pointer to the debug info list of the lock */ + UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; + + /** Level in the global latching order. */ + latch_level_t level; +#endif /* UNIV_DEBUG */ +}; +#ifdef UNIV_DEBUG +/** The structure for storing debug info of an rw-lock. All access to this +structure must be protected by rw_lock_debug_mutex_enter(). */ +struct rw_lock_debug_t { + + os_thread_id_t thread_id; /*!< The thread id of the thread which + locked the rw-lock */ + ulint pass; /*!< Pass value given in the lock operation */ + ulint lock_type; /*!< Type of the lock: RW_LOCK_X, + RW_LOCK_S, RW_LOCK_X_WAIT */ + const char* file_name;/*!< File name where the lock was obtained */ + unsigned line; /*!< Line where the rw-lock was locked */ + UT_LIST_NODE_T(rw_lock_debug_t) list; + /*!< Debug structs are linked in a two-way + list */ +}; +#endif /* UNIV_DEBUG */ + +/* For performance schema instrumentation, a new set of rwlock +wrap functions are created if "UNIV_PFS_RWLOCK" is defined. 
+The instrumentations are not planted directly into original +functions, so that we keep the underlying function as they +are. And in case, user wants to "take out" some rwlock from +instrumentation even if performance schema (UNIV_PFS_RWLOCK) +is defined, they can do so by reinstating APIs directly link to +original underlying functions. +The instrumented function names have prefix of "pfs_rw_lock_" vs. +original name prefix of "rw_lock_". Following are list of functions +that have been instrumented: + +rw_lock_create() +rw_lock_x_lock() +rw_lock_x_lock_gen() +rw_lock_x_lock_nowait() +rw_lock_x_unlock_gen() +rw_lock_s_lock() +rw_lock_s_lock_gen() +rw_lock_s_lock_nowait() +rw_lock_s_unlock_gen() +rw_lock_sx_lock() +rw_lock_sx_unlock_gen() +rw_lock_free() +*/ + +#ifdef UNIV_PFS_RWLOCK +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func() +NOTE! Please use the corresponding macro rw_lock_create(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + PSI_rwlock_key key, /*!< in: key registered with + performance schema */ + rw_lock_t* lock, /*!< in: rw lock */ +#ifdef UNIV_DEBUG + latch_level_t level, /*!< in: level */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + unsigned cline); /*!< in: file line where created */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not +directly this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for +rw_lock_x_lock_func_nowait() +NOTE! Please use the corresponding macro, not directly this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_x_lock_func_nowait( +/*===========================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! 
+@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_s_lock_low( +/*===================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_unlock_func() +NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_x_unlock_func( +/*======================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_sx_lock_func() +NOTE! Please use the corresponding macro rw_lock_sx_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_sx_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_sx_lock_nowait() +NOTE! Please use the corresponding macro, not directly +this function! */ +UNIV_INLINE +ibool +pfs_rw_lock_sx_lock_low( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_sx_unlock_func() +NOTE! Please use the corresponding macro rw_lock_sx_unlock(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_sx_unlock_func( +/*======================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + rw_lock_t* lock); /*!< in: rw-lock */ +#endif /* UNIV_PFS_RWLOCK */ + +#include "sync0rw.ic" + +#endif /* sync0rw.h */ diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic new file mode 100644 index 00000000..169cbdd9 --- /dev/null +++ b/storage/innobase/include/sync0rw.ic @@ -0,0 +1,842 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2017, 2020, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0rw.ic +The read-write lock (for threads) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +#include "os0event.h" + +/******************************************************************//** +Lock an rw-lock in shared mode for the current thread. If the rw-lock is +locked in exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by srv_n_spin_wait_rounds), +waiting for the lock before suspending the thread. */ +void +rw_lock_s_lock_spin( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line); /*!< in: line where requested */ +#ifdef UNIV_DEBUG +/******************************************************************//** +Inserts the debug information for an rw-lock. */ +void +rw_lock_add_debug_info( +/*===================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type, /*!< in: lock type */ + const char* file_name, /*!< in: file where requested */ + unsigned line); /*!< in: line where requested */ +/******************************************************************//** +Removes a debug information struct for an rw-lock. 
*/ +void +rw_lock_remove_debug_info( +/*======================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type); /*!< in: lock type */ +#endif /* UNIV_DEBUG */ + +/******************************************************************//** +Returns the write-status of the lock - this function made more sense +with the old rw_lock implementation. +@return RW_LOCK_NOT_LOCKED, RW_LOCK_X, RW_LOCK_X_WAIT, RW_LOCK_SX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + int32_t lock_word = lock->lock_word; + + ut_ad(lock_word <= X_LOCK_DECR); + if (lock_word > X_LOCK_HALF_DECR) { + /* return NOT_LOCKED in s-lock state, like the writer + member of the old lock implementation. */ + return(RW_LOCK_NOT_LOCKED); + } else if (lock_word > 0) { + /* sx-locked, no x-locks */ + return(RW_LOCK_SX); + } else if (lock_word == 0 + || lock_word == -X_LOCK_HALF_DECR + || lock_word <= -X_LOCK_DECR) { + /* x-lock with sx-lock is also treated as RW_LOCK_X */ + return(RW_LOCK_X); + } else { + /* x-waiter with sx-lock is also treated as RW_LOCK_X_WAIT + e.g. -X_LOCK_HALF_DECR < lock_word < 0 : without sx + -X_LOCK_DECR < lock_word < -X_LOCK_HALF_DECR : with sx */ + return(RW_LOCK_X_WAIT); + } +} + +/******************************************************************//** +Returns the number of readers (s-locks). 
+@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + int32_t lock_word = lock->lock_word; + ut_ad(lock_word <= X_LOCK_DECR); + + if (lock_word > X_LOCK_HALF_DECR) { + /* s-locked, no x-waiter */ + return ulint(X_LOCK_DECR - lock_word); + } else if (lock_word > 0) { + /* s-locked, with sx-locks only */ + return ulint(X_LOCK_HALF_DECR - lock_word); + } else if (lock_word == 0) { + /* x-locked */ + return(0); + } else if (lock_word > -X_LOCK_HALF_DECR) { + /* s-locked, with x-waiter */ + return((ulint)(-lock_word)); + } else if (lock_word == -X_LOCK_HALF_DECR) { + /* x-locked with sx-locks */ + return(0); + } else if (lock_word > -X_LOCK_DECR) { + /* s-locked, with x-waiter and sx-lock */ + return((ulint)(-(lock_word + X_LOCK_HALF_DECR))); + } + /* no s-locks */ + return(0); +} + +/******************************************************************//** +Returns the value of writer_count for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. +@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + int32_t lock_copy = lock->lock_word; + ut_ad(lock_copy <= X_LOCK_DECR); + + if (lock_copy == 0 || lock_copy == -X_LOCK_HALF_DECR) { + /* "1 x-lock" or "1 x-lock + sx-locks" */ + return(1); + } else if (lock_copy > -X_LOCK_DECR) { + /* s-locks, one or more sx-locks if > 0, or x-waiter if < 0 */ + return(0); + } else if (lock_copy > -(X_LOCK_DECR + X_LOCK_HALF_DECR)) { + /* no s-lock, no sx-lock, 2 or more x-locks. + First 2 x-locks are set with -X_LOCK_DECR, + all other recursive x-locks are set with -1 */ + return ulint(2 - X_LOCK_DECR - lock_copy); + } else { + /* no s-lock, 1 or more sx-lock, 2 or more x-locks. 
+ First 2 x-locks are set with -(X_LOCK_DECR + X_LOCK_HALF_DECR), + all other recursive x-locks are set with -1 */ + return ulint(2 - X_LOCK_DECR - X_LOCK_HALF_DECR - lock_copy); + } +} + +/******************************************************************//** +Returns the number of sx-lock for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. +@return value of sx-lock count */ +UNIV_INLINE +ulint +rw_lock_get_sx_lock_count( +/*======================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ +#ifdef UNIV_DEBUG + int32_t lock_copy = lock->lock_word; + + ut_ad(lock_copy <= X_LOCK_DECR); + + while (lock_copy < 0) { + lock_copy += X_LOCK_DECR; + } + + if (lock_copy > 0 && lock_copy <= X_LOCK_HALF_DECR) { + return(lock->sx_recursive); + } + + return(0); +#else /* UNIV_DEBUG */ + return(lock->sx_recursive); +#endif /* UNIV_DEBUG */ +} + +/******************************************************************//** +Recursive x-locks are not supported: they should be handled by the caller and +need not be atomic since they are performed by the current lock holder. +Returns true if the decrement was made, false if not. +@return true if decr occurs */ +UNIV_INLINE +bool +rw_lock_lock_word_decr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + int32_t amount, /*!< in: amount to decrement */ + int32_t threshold) /*!< in: threshold of judgement */ +{ + int32_t lock_copy = lock->lock_word; + + while (lock_copy > threshold) { + if (lock->lock_word.compare_exchange_strong( + lock_copy, + lock_copy - amount, + std::memory_order_acquire, + std::memory_order_relaxed)) { + + return(true); + } + + /* Note that lock_copy was reloaded above. We will + keep trying if a spurious conflict occurred, typically + caused by concurrent executions of + rw_lock_s_lock(). 
*/ + + /* Note: unlike this implementation, rw_lock::read_lock() + allows concurrent calls without a spin loop */ + } + + /* A real conflict was detected. */ + return(false); +} + +/******************************************************************//** +Low-level function which tries to lock an rw-lock in s-mode. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass MY_ATTRIBUTE((unused)), + /*!< in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + unsigned line) /*!< in: line where requested */ +{ + if (!rw_lock_lock_word_decr(lock, 1, 0)) { + /* Locking did not succeed */ + return(FALSE); + } + + ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_S, file_name, line)); + + return(TRUE); /* locking succeeded */ +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in shared mode for the current thread. If the rw-lock is locked +in exclusive mode, or there is an exclusive lock request waiting, the +function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting for +the lock, before suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line) /*!< in: line where requested */ +{ + /* NOTE: As we do not know the thread ids for threads which have + s-locked a latch, and s-lockers will be served only after waiting + x-lock requests have been fulfilled, then if this thread already + owns an s-lock here, it may end up in a deadlock with another thread + which requests an x-lock here. 
Therefore, we will forbid recursive + s-locking of a latch: the following assert will warn the programmer + of the possibility of this kind of a deadlock. If we want to implement + safe recursive s-locking, we should keep in a list the thread ids of + the threads which have s-locked a latch. This would use some CPU + time. */ + + ut_ad(!rw_lock_own_flagged(lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + if (!rw_lock_s_lock_low(lock, pass, file_name, line)) { + + /* Did not succeed, try spin wait */ + + rw_lock_s_lock_spin(lock, pass, file_name, line); + } +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_x_lock_func_nowait( +/*=======================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line) /*!< in: line where requested */ +{ + int32_t oldval = X_LOCK_DECR; + + if (lock->lock_word.compare_exchange_strong(oldval, 0, + std::memory_order_acquire, + std::memory_order_relaxed)) { + lock->writer_thread = os_thread_get_curr_id(); + + } else if (os_thread_eq(lock->writer_thread, os_thread_get_curr_id())) { + /* Relock: even though no other thread can modify (lock, unlock + or reserve) lock_word while there is an exclusive writer and + this is the writer thread, we still want concurrent threads to + observe consistent values. 
*/ + if (oldval == 0 || oldval == -X_LOCK_HALF_DECR) { + /* There is 1 x-lock */ + lock->lock_word.fetch_sub(X_LOCK_DECR, + std::memory_order_relaxed); + } else if (oldval <= -X_LOCK_DECR) { + /* There are 2 or more x-locks */ + lock->lock_word.fetch_sub(1, + std::memory_order_relaxed); + /* Watch for too many recursive locks */ + ut_ad(oldval < 1); + } else { + /* Failure */ + return(FALSE); + } + } else { + /* Failure */ + return(FALSE); + } + + ut_d(rw_lock_add_debug_info(lock, 0, RW_LOCK_X, file_name, line)); + + lock->last_x_file_name = file_name; + lock->last_x_line = line & ((1 << 14) - 1); + + ut_ad(rw_lock_validate(lock)); + + return(TRUE); +} + +/******************************************************************//** +Releases a shared mode lock. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_S)); + + /* Increment lock_word to indicate one fewer reader */ + int32_t lock_word = lock->lock_word.fetch_add( + 1, std::memory_order_release); + + if (lock_word == -1 || lock_word == -X_LOCK_HALF_DECR - 1) { + /* wait_ex waiter exists. It may not be asleep, but we signal + anyway. We do not wake other waiters, because they can't + exist without wait_ex waiter and wait_ex waiter goes first. */ + os_event_set(lock->wait_ex_event); + sync_array_object_signalled(); + } else { + ut_ad(lock_word > -X_LOCK_DECR); + ut_ad(lock_word < X_LOCK_DECR); + } + + ut_ad(rw_lock_validate(lock)); +} + +/******************************************************************//** +Releases an exclusive mode lock. 
*/ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + int32_t lock_word = lock->lock_word; + + if (lock_word == 0) { + /* Last caller in a possible recursive chain. */ + lock->writer_thread = 0; + } + + ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_X)); + + if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { + /* Last X-lock owned by this thread, it may still hold SX-locks. + ACQ_REL due to... + RELEASE: we release rw-lock + ACQUIRE: we want waiters to be loaded after lock_word is stored */ + lock->lock_word.fetch_add(X_LOCK_DECR, + std::memory_order_acq_rel); + + /* This no longer has an X-lock but it may still have + an SX-lock. So it is now free for S-locks by other threads. + We need to signal read/write waiters. + We do not need to signal wait_ex waiters, since they cannot + exist when there is a writer. */ + if (lock->waiters) { + lock->waiters = 0; + os_event_set(lock->event); + sync_array_object_signalled(); + } + } else if (lock_word == -X_LOCK_DECR + || lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR)) { + /* There are 2 x-locks */ + lock->lock_word.fetch_add(X_LOCK_DECR); + } else { + /* There are more than 2 x-locks. */ + ut_ad(lock_word < -X_LOCK_DECR); + lock->lock_word.fetch_add(1); + } + + ut_ad(rw_lock_validate(lock)); +} + +/******************************************************************//** +Releases a sx mode lock. 
*/ +UNIV_INLINE +void +rw_lock_sx_unlock_func( +/*===================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ut_ad(rw_lock_get_sx_lock_count(lock)); + ut_ad(lock->sx_recursive > 0); + + --lock->sx_recursive; + + ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_SX)); + + if (lock->sx_recursive == 0) { + int32_t lock_word = lock->lock_word; + /* Last caller in a possible recursive chain. */ + if (lock_word > 0) { + lock->writer_thread = 0; + ut_ad(lock_word <= INT_MAX32 - X_LOCK_HALF_DECR); + + /* Last SX-lock owned by this thread, doesn't own X-lock. + ACQ_REL due to... + RELEASE: we release rw-lock + ACQUIRE: we want waiters to be loaded after lock_word is stored */ + lock->lock_word.fetch_add(X_LOCK_HALF_DECR, + std::memory_order_acq_rel); + + /* Lock is now free. May have to signal read/write + waiters. We do not need to signal wait_ex waiters, + since they cannot exist when there is an sx-lock + holder. */ + if (lock->waiters) { + lock->waiters = 0; + os_event_set(lock->event); + sync_array_object_signalled(); + } + } else { + /* still has x-lock */ + ut_ad(lock_word == -X_LOCK_HALF_DECR || + lock_word <= -(X_LOCK_DECR + X_LOCK_HALF_DECR)); + lock->lock_word.fetch_add(X_LOCK_HALF_DECR); + } + } + + ut_ad(rw_lock_validate(lock)); +} + +#ifdef UNIV_PFS_RWLOCK + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func(). +NOTE! Please use the corresponding macro rw_lock_create(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: key registered with + performance schema */ + rw_lock_t* lock, /*!< in/out: pointer to memory */ +# ifdef UNIV_DEBUG + latch_level_t level, /*!< in: level */ +# endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + unsigned cline) /*!< in: file line where created */ +{ + ut_d(new(lock) rw_lock_t()); + + /* Initialize the rwlock for performance schema */ + lock->pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock); + + /* The actual function to initialize an rwlock */ + rw_lock_create_func(lock, +#ifdef UNIV_DEBUG + level, +#endif /* UNIV_DEBUG */ + cfile_name, + cline); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + unsigned line) /*!< in: line where requested */ +{ + if (lock->pfs_psi != NULL) { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Record the acquisition of a read-write lock in exclusive + mode in performance schema */ + + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->pfs_psi, PSI_RWLOCK_EXCLUSIVELOCK, + file_name, static_cast<uint>(line)); + + rw_lock_x_lock_func( + lock, pass, file_name, static_cast<uint>(line)); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0); + } + } else { + rw_lock_x_lock_func(lock, pass, file_name, line); + } +} +/******************************************************************//** +Performance schema instrumented wrap function for 
+rw_lock_x_lock_func_nowait() +NOTE! Please use the corresponding macro rw_lock_x_lock_func(), +not directly this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_x_lock_func_nowait( +/*===========================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock + requested */ + unsigned line) /*!< in: line where requested */ +{ + ibool ret; + + if (lock->pfs_psi != NULL) { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Record the acquisition of a read-write trylock in exclusive + mode in performance schema */ + + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->pfs_psi, PSI_RWLOCK_TRYEXCLUSIVELOCK, + file_name, static_cast<uint>(line)); + + ret = rw_lock_x_lock_func_nowait(lock, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)( + locker, static_cast<int>(ret)); + } + } else { + ret = rw_lock_x_lock_func_nowait(lock, file_name, line); + } + + return(ret); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + rw_lock_t* lock) /*!< in: pointer to rw-lock */ +{ + if (lock->pfs_psi != NULL) { + PSI_RWLOCK_CALL(destroy_rwlock)(lock->pfs_psi); + lock->pfs_psi = NULL; + } + + rw_lock_free_func(lock); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not +directly this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name,/*!< in: file name where lock + requested */ + unsigned line) /*!< in: line where requested */ +{ + if (lock->pfs_psi != NULL) { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are acquiring a shared rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( + &state, lock->pfs_psi, PSI_RWLOCK_SHAREDLOCK, + file_name, static_cast<uint>(line)); + + rw_lock_s_lock_func(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0); + } + } else { + rw_lock_s_lock_func(lock, pass, file_name, line); + } +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_sx_lock_func() +NOTE! Please use the corresponding macro rw_lock_sx_lock(), not +directly this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_sx_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name,/*!< in: file name where lock + requested */ + unsigned line) /*!< in: line where requested */ +{ + if (lock->pfs_psi != NULL) { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are acquiring a shared + exclusive rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->pfs_psi, PSI_RWLOCK_SHAREDEXCLUSIVELOCK, + file_name, static_cast<uint>(line)); + + rw_lock_sx_lock_func(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0); + } + } else { + rw_lock_sx_lock_func(lock, pass, file_name, line); + } +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not +directly this function! 
+@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_s_lock_low( +/*===================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + unsigned line) /*!< in: line where requested */ +{ + ibool ret; + + if (lock->pfs_psi != NULL) { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are acquiring a shared rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( + &state, lock->pfs_psi, PSI_RWLOCK_TRYSHAREDLOCK, + file_name, static_cast<uint>(line)); + + ret = rw_lock_s_lock_low(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_rdwait)( + locker, static_cast<int>(ret)); + } + } else { + ret = rw_lock_s_lock_low(lock, pass, file_name, line); + } + + return(ret); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_sx_lock_nowait() +NOTE! Please use the corresponding macro, not +directly this function! 
+@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_sx_lock_low( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + unsigned line) /*!< in: line where requested */ +{ + ibool ret; + + if (lock->pfs_psi != NULL) { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are acquiring a shared + exclusive rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( + &state, lock->pfs_psi, + PSI_RWLOCK_TRYSHAREDEXCLUSIVELOCK, + file_name, static_cast<uint>(line)); + + ret = rw_lock_sx_lock_low(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_rdwait)( + locker, static_cast<int>(ret)); + } + } else { + ret = rw_lock_sx_lock_low(lock, pass, file_name, line); + } + + return(ret); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_unlock_func() +NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_unlock_func( +/*======================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + /* Inform performance schema we are unlocking the lock */ + if (lock->pfs_psi != NULL) { + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); + } + + rw_lock_x_unlock_func( +#ifdef UNIV_DEBUG + pass, +#endif /* UNIV_DEBUG */ + lock); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_sx_unlock_func() +NOTE! Please use the corresponding macro rw_lock_sx_unlock(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_sx_unlock_func( +/*======================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + /* Inform performance schema we are unlocking the lock */ + if (lock->pfs_psi != NULL) { + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); + } + + rw_lock_sx_unlock_func( +#ifdef UNIV_DEBUG + pass, +#endif /* UNIV_DEBUG */ + lock); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro pfs_rw_lock_s_unlock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif /* UNIV_DEBUG */ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + /* Inform performance schema we are unlocking the lock */ + if (lock->pfs_psi != NULL) { + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); + } + + rw_lock_s_unlock_func( +#ifdef UNIV_DEBUG + pass, +#endif /* UNIV_DEBUG */ + lock); + +} +#endif /* UNIV_PFS_RWLOCK */ diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h new file mode 100644 index 00000000..b7f3cff2 --- /dev/null +++ b/storage/innobase/include/sync0sync.h @@ -0,0 +1,107 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2020, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. 
The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0sync.h +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0sync_h +#define sync0sync_h + +#include "univ.i" + +#ifdef UNIV_PFS_MUTEX +/* Key defines to register InnoDB mutexes with performance schema */ +extern mysql_pfs_key_t buf_pool_mutex_key; +extern mysql_pfs_key_t dict_foreign_err_mutex_key; +extern mysql_pfs_key_t dict_sys_mutex_key; +extern mysql_pfs_key_t fil_system_mutex_key; +extern mysql_pfs_key_t flush_list_mutex_key; +extern mysql_pfs_key_t fts_delete_mutex_key; +extern mysql_pfs_key_t fts_doc_id_mutex_key; +extern mysql_pfs_key_t fts_pll_tokenize_mutex_key; +extern mysql_pfs_key_t ibuf_bitmap_mutex_key; +extern mysql_pfs_key_t ibuf_mutex_key; +extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +extern mysql_pfs_key_t log_sys_mutex_key; +extern mysql_pfs_key_t log_cmdq_mutex_key; +extern mysql_pfs_key_t log_flush_order_mutex_key; +extern mysql_pfs_key_t recalc_pool_mutex_key; +extern mysql_pfs_key_t 
purge_sys_pq_mutex_key; +extern mysql_pfs_key_t recv_sys_mutex_key; +extern mysql_pfs_key_t rtr_active_mutex_key; +extern mysql_pfs_key_t rtr_match_mutex_key; +extern mysql_pfs_key_t rtr_path_mutex_key; +extern mysql_pfs_key_t redo_rseg_mutex_key; +extern mysql_pfs_key_t noredo_rseg_mutex_key; +extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +# ifdef UNIV_DEBUG +extern mysql_pfs_key_t rw_lock_debug_mutex_key; +# endif /* UNIV_DEBUG */ +extern mysql_pfs_key_t rw_lock_list_mutex_key; +extern mysql_pfs_key_t srv_innodb_monitor_mutex_key; +extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key; +extern mysql_pfs_key_t srv_monitor_file_mutex_key; +extern mysql_pfs_key_t buf_dblwr_mutex_key; +extern mysql_pfs_key_t trx_mutex_key; +extern mysql_pfs_key_t trx_pool_mutex_key; +extern mysql_pfs_key_t trx_pool_manager_mutex_key; +extern mysql_pfs_key_t lock_mutex_key; +extern mysql_pfs_key_t lock_wait_mutex_key; +extern mysql_pfs_key_t trx_sys_mutex_key; +extern mysql_pfs_key_t srv_threads_mutex_key; +extern mysql_pfs_key_t sync_array_mutex_key; +extern mysql_pfs_key_t thread_mutex_key; +extern mysql_pfs_key_t row_drop_list_mutex_key; +extern mysql_pfs_key_t rw_trx_hash_element_mutex_key; +extern mysql_pfs_key_t read_view_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_PFS_RWLOCK +/* Following are rwlock keys used to register with MySQL +performance schema */ +extern mysql_pfs_key_t btr_search_latch_key; +extern mysql_pfs_key_t dict_operation_lock_key; +extern mysql_pfs_key_t fil_space_latch_key; +extern mysql_pfs_key_t fts_cache_rw_lock_key; +extern mysql_pfs_key_t fts_cache_init_rw_lock_key; +extern mysql_pfs_key_t trx_i_s_cache_lock_key; +extern mysql_pfs_key_t trx_purge_latch_key; +extern mysql_pfs_key_t index_tree_rw_lock_key; +extern mysql_pfs_key_t index_online_log_key; +extern mysql_pfs_key_t trx_sys_rw_lock_key; +#endif /* UNIV_PFS_RWLOCK */ + +/** Prints info of the sync system. 
+@param[in] file where to print */ +void +sync_print(FILE* file); + +#endif /* !sync0sync_h */ diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h new file mode 100644 index 00000000..feb1e3b4 --- /dev/null +++ b/storage/innobase/include/sync0types.h @@ -0,0 +1,1060 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0types.h +Global types for sync + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0types_h +#define sync0types_h + +#include <vector> + +#include "ut0new.h" + +#ifdef _WIN32 +/** Native mutex */ +typedef CRITICAL_SECTION sys_mutex_t; +#else +/** Native mutex */ +typedef pthread_mutex_t sys_mutex_t; +#endif /* _WIN32 */ + +/** Mutex states. */ +enum mutex_state_t { + /** Mutex is free */ + MUTEX_STATE_UNLOCKED = 0, + + /** Mutex is acquired by some thread. */ + MUTEX_STATE_LOCKED = 1, + + /** Mutex is contended and there are threads waiting on the lock. 
*/ + MUTEX_STATE_WAITERS = 2 +}; + +/* + LATCHING ORDER WITHIN THE DATABASE + ================================== + +The mutex or latch in the central memory object, for instance, a rollback +segment object, must be acquired before acquiring the latch or latches to +the corresponding file data structure. In the latching order below, these +file page object latches are placed immediately below the corresponding +central memory object latch or mutex. + +Synchronization object Notes +---------------------- ----- + +Dictionary mutex If we have a pointer to a dictionary +| object, e.g., a table, it can be +| accessed without reserving the +| dictionary mutex. We must have a +| reservation, a memoryfix, to the +| appropriate table object in this case, +| and the table must be explicitly +| released later. +V +Dictionary header +| +V +Secondary index tree latch The tree latch protects also all +| the B-tree non-leaf pages. These +V can be read with the page only +Secondary index non-leaf bufferfixed to save CPU time, +| no s-latch is needed on the page. +| Modification of a page requires an +| x-latch on the page, however. If a +| thread owns an x-latch to the tree, +| it is allowed to latch non-leaf pages +| even after it has acquired the fsp +| latch. +V +Secondary index leaf The latch on the secondary index leaf +| can be kept while accessing the +| clustered index, to save CPU time. +V +Clustered index tree latch To increase concurrency, the tree +| latch is usually released when the +| leaf page latch has been acquired. +V +Clustered index non-leaf +| +V +Clustered index leaf +| +V +Transaction system header +| +V +Rollback segment mutex The rollback segment mutex must be +| reserved, if, e.g., a new page must +| be added to an undo log. The rollback +| segment and the undo logs in its +| history list can be seen as an +| analogue of a B-tree, and the latches +| reserved similarly, using a version of +| lock-coupling. 
If an undo log must be +| extended by a page when inserting an +| undo log record, this corresponds to +| a pessimistic insert in a B-tree. +V +Rollback segment header +| +V +Purge system latch +| +V +Undo log pages If a thread owns the trx undo mutex, +| or for a log in the history list, the +| rseg mutex, it is allowed to latch +| undo log pages in any order, and even +| after it has acquired the fsp latch. +| If a thread does not have the +| appropriate mutex, it is allowed to +| latch only a single undo log page in +| a mini-transaction. +V +File space management latch If a mini-transaction must allocate +| several file pages, it can do that, +| because it keeps the x-latch to the +| file space management in its memo. +V +File system pages +| +V +lock_sys_wait_mutex Mutex protecting lock timeout data +| +V +lock_sys_mutex Mutex protecting lock_sys_t +| +V +trx_sys.mutex Mutex protecting trx_sys.trx_list +| +V +Threads mutex Background thread scheduling mutex +| +V +query_thr_mutex Mutex protecting query threads +| +V +trx_mutex Mutex protecting trx_t fields +| +V +Search system mutex +| +V +Buffer pool mutex +| +V +Log mutex +| +Any other latch +| +V +Memory pool mutex */ + +/** Latching order levels. 
If you modify these, you have to also update +LatchDebug internals in sync0debug.cc */ + +enum latch_level_t { + SYNC_UNKNOWN = 0, + + SYNC_MUTEX = 1, + + RW_LOCK_SX, + RW_LOCK_X_WAIT, + RW_LOCK_S, + RW_LOCK_X, + RW_LOCK_NOT_LOCKED, + + SYNC_ANY_LATCH, + + SYNC_POOL, + SYNC_POOL_MANAGER, + + SYNC_SEARCH_SYS, + + SYNC_WORK_QUEUE, + + SYNC_FTS_TOKENIZE, + SYNC_FTS_OPTIMIZE, + SYNC_FTS_CACHE_INIT, + SYNC_RECV, + SYNC_PURGE_QUEUE, + SYNC_TRX_SYS_HEADER, + SYNC_TRX, + SYNC_RW_TRX_HASH_ELEMENT, + SYNC_READ_VIEW, + SYNC_TRX_SYS, + SYNC_LOCK_SYS, + SYNC_LOCK_WAIT_SYS, + + SYNC_INDEX_ONLINE_LOG, + + SYNC_IBUF_BITMAP, + SYNC_IBUF_BITMAP_MUTEX, + SYNC_IBUF_TREE_NODE, + SYNC_IBUF_TREE_NODE_NEW, + SYNC_IBUF_INDEX_TREE, + + SYNC_IBUF_MUTEX, + + SYNC_FSP_PAGE, + SYNC_FSP, + SYNC_EXTERN_STORAGE, + SYNC_TRX_UNDO_PAGE, + SYNC_RSEG_HEADER, + SYNC_RSEG_HEADER_NEW, + SYNC_NOREDO_RSEG, + SYNC_REDO_RSEG, + SYNC_PURGE_LATCH, + SYNC_TREE_NODE, + SYNC_TREE_NODE_FROM_HASH, + SYNC_TREE_NODE_NEW, + SYNC_IBUF_PESS_INSERT_MUTEX, + SYNC_INDEX_TREE, + + SYNC_IBUF_HEADER, + SYNC_DICT_HEADER, + SYNC_STATS_AUTO_RECALC, + SYNC_DICT, + SYNC_FTS_CACHE, + + SYNC_DICT_OPERATION, + + SYNC_TRX_I_S_RWLOCK, + + /** Level is varying. Only used with buffer pool page locks, which + do not have a fixed level, but instead have their level set after + the page is locked; see e.g. ibuf_bitmap_get_map_page(). */ + + SYNC_LEVEL_VARYING, + + /** This can be used to suppress order checking. */ + SYNC_NO_ORDER_CHECK, + + /** Maximum level value */ + SYNC_LEVEL_MAX = SYNC_NO_ORDER_CHECK +}; + +/** Each latch has an ID. This id is used for creating the latch and to look +up its meta-data. See sync0debug.cc. 
*/ +enum latch_id_t { + LATCH_ID_NONE = 0, + LATCH_ID_DICT_FOREIGN_ERR, + LATCH_ID_DICT_SYS, + LATCH_ID_FIL_SYSTEM, + LATCH_ID_FTS_DELETE, + LATCH_ID_FTS_DOC_ID, + LATCH_ID_FTS_PLL_TOKENIZE, + LATCH_ID_IBUF_BITMAP, + LATCH_ID_IBUF, + LATCH_ID_IBUF_PESSIMISTIC_INSERT, + LATCH_ID_PURGE_SYS_PQ, + LATCH_ID_RECALC_POOL, + LATCH_ID_RECV_SYS, + LATCH_ID_REDO_RSEG, + LATCH_ID_NOREDO_RSEG, + LATCH_ID_RW_LOCK_DEBUG, + LATCH_ID_RTR_ACTIVE_MUTEX, + LATCH_ID_RTR_MATCH_MUTEX, + LATCH_ID_RTR_PATH_MUTEX, + LATCH_ID_RW_LOCK_LIST, + LATCH_ID_SRV_INNODB_MONITOR, + LATCH_ID_SRV_MISC_TMPFILE, + LATCH_ID_SRV_MONITOR_FILE, + LATCH_ID_TRX_POOL, + LATCH_ID_TRX_POOL_MANAGER, + LATCH_ID_TRX, + LATCH_ID_LOCK_SYS, + LATCH_ID_LOCK_SYS_WAIT, + LATCH_ID_TRX_SYS, + LATCH_ID_SRV_SYS_TASKS, + LATCH_ID_PAGE_ZIP_STAT_PER_INDEX, + LATCH_ID_SYNC_ARRAY_MUTEX, + LATCH_ID_ROW_DROP_LIST, + LATCH_ID_INDEX_ONLINE_LOG, + LATCH_ID_WORK_QUEUE, + LATCH_ID_BTR_SEARCH, + LATCH_ID_BUF_BLOCK_LOCK, + LATCH_ID_BUF_BLOCK_DEBUG, + LATCH_ID_DICT_OPERATION, + LATCH_ID_FIL_SPACE, + LATCH_ID_FTS_CACHE, + LATCH_ID_FTS_CACHE_INIT, + LATCH_ID_TRX_I_S_CACHE, + LATCH_ID_TRX_PURGE, + LATCH_ID_IBUF_INDEX_TREE, + LATCH_ID_INDEX_TREE, + LATCH_ID_DICT_TABLE_STATS, + LATCH_ID_DEFRAGMENT_MUTEX, + LATCH_ID_BTR_DEFRAGMENT_MUTEX, + LATCH_ID_FIL_CRYPT_STAT_MUTEX, + LATCH_ID_FIL_CRYPT_DATA_MUTEX, + LATCH_ID_FIL_CRYPT_THREADS_MUTEX, + LATCH_ID_RW_TRX_HASH_ELEMENT, + LATCH_ID_READ_VIEW, + LATCH_ID_MAX = LATCH_ID_READ_VIEW +}; + +#ifndef UNIV_INNOCHECKSUM +/** OS mutex, without any policy. It is a thin wrapper around the +system mutexes. The interface is different from the policy mutexes, +to ensure that it is called directly and not confused with the +policy mutexes. */ +struct OSMutex { + + /** Constructor */ + OSMutex() + UNIV_NOTHROW + { + ut_d(m_freed = true); + } + + /** Create the mutex by calling the system functions. 
*/ + void init() + UNIV_NOTHROW + { + ut_ad(m_freed); + +#ifdef _WIN32 + InitializeCriticalSection((LPCRITICAL_SECTION) &m_mutex); +#else + { + int ret = pthread_mutex_init(&m_mutex, NULL); + ut_a(ret == 0); + } +#endif /* _WIN32 */ + + ut_d(m_freed = false); + } + + /** Destructor */ + ~OSMutex() { } + + /** Destroy the mutex */ + void destroy() + UNIV_NOTHROW + { + ut_ad(!m_freed); +#ifdef _WIN32 + DeleteCriticalSection((LPCRITICAL_SECTION) &m_mutex); +#else + int ret; + + ret = pthread_mutex_destroy(&m_mutex); + + if (ret != 0) { + + ib::error() + << "Return value " << ret << " when calling " + << "pthread_mutex_destroy()."; + } +#endif /* _WIN32 */ + ut_d(m_freed = true); + } + + /** Release the mutex. */ + void exit() + UNIV_NOTHROW + { + ut_ad(!m_freed); +#ifdef _WIN32 + LeaveCriticalSection(&m_mutex); +#else + int ret = pthread_mutex_unlock(&m_mutex); + ut_a(ret == 0); +#endif /* _WIN32 */ + } + + /** Acquire the mutex. */ + void enter() + UNIV_NOTHROW + { + ut_ad(!m_freed); +#ifdef _WIN32 + EnterCriticalSection((LPCRITICAL_SECTION) &m_mutex); +#else + int ret = pthread_mutex_lock(&m_mutex); + ut_a(ret == 0); +#endif /* _WIN32 */ + } + + /** @return true if locking succeeded */ + bool try_lock() + UNIV_NOTHROW + { + ut_ad(!m_freed); +#ifdef _WIN32 + return(TryEnterCriticalSection(&m_mutex) != 0); +#else + return(pthread_mutex_trylock(&m_mutex) == 0); +#endif /* _WIN32 */ + } + + /** Required for os_event_t */ + operator sys_mutex_t*() + UNIV_NOTHROW + { + return(&m_mutex); + } + +private: +#ifdef DBUG_ASSERT_EXISTS + /** true if the mutex has been freed/destroyed. */ + bool m_freed; +#endif /* DBUG_ASSERT_EXISTS */ + + sys_mutex_t m_mutex; +}; + +#ifdef UNIV_PFS_MUTEX +/** Latch element. +Used for mutexes which have PFS keys defined under UNIV_PFS_MUTEX. 
+@param[in] id Latch id +@param[in] level Latch level +@param[in] key PFS key */ +# define LATCH_ADD_MUTEX(id, level, key) latch_meta[LATCH_ID_ ## id] =\ + UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level, key)) + +#ifdef UNIV_PFS_RWLOCK +/** Latch element. +Used for rwlocks which have PFS keys defined under UNIV_PFS_RWLOCK. +@param[in] id Latch id +@param[in] level Latch level +@param[in] key PFS key */ +# define LATCH_ADD_RWLOCK(id, level, key) latch_meta[LATCH_ID_ ## id] =\ + UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level, key)) +#else +# define LATCH_ADD_RWLOCK(id, level, key) latch_meta[LATCH_ID_ ## id] =\ + UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level, \ + PSI_NOT_INSTRUMENTED)) +#endif /* UNIV_PFS_RWLOCK */ + +#else +# define LATCH_ADD_MUTEX(id, level, key) latch_meta[LATCH_ID_ ## id] =\ + UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level)) +# define LATCH_ADD_RWLOCK(id, level, key) latch_meta[LATCH_ID_ ## id] =\ + UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level)) +#endif /* UNIV_PFS_MUTEX */ + +/** Default latch counter */ +class LatchCounter { + +public: + /** The counts we collect for a mutex */ + struct Count { + + /** Constructor: value-initializes all counts and the enabled flag */ + Count() + UNIV_NOTHROW + : + m_spins(), + m_waits(), + m_calls(), + m_enabled() + { + /* No op */ + } + + /** Reset the spin, wait and call counts to zero; + m_enabled is left unchanged */ + void reset() + UNIV_NOTHROW + { + m_spins = 0; + m_waits = 0; + m_calls = 0; + } + + /** Number of spins trying to acquire the latch. 
*/ + uint32_t m_spins; + + /** Number of waits trying to acquire the latch */ + uint32_t m_waits; + + /** Number of times it was called */ + uint32_t m_calls; + + /** true if enabled */ + bool m_enabled; + }; + + /** Constructor: monitoring starts inactive and the mutex is initialized */ + LatchCounter() + UNIV_NOTHROW + : + m_active(false) + { + m_mutex.init(); + } + + /** Destructor: destroys the mutex, then passes every counter that is + still registered in m_counters to UT_DELETE */ + ~LatchCounter() + UNIV_NOTHROW + { + m_mutex.destroy(); + + for (Counters::iterator it = m_counters.begin(); + it != m_counters.end(); + ++it) { + + Count* count = *it; + + UT_DELETE(count); + } + } + + /** Reset all counters to zero. The walk over m_counters is done + while holding m_mutex, but the counter fields themselves are + plain integers that are not updated atomically. That is + acceptable unless it is a demonstrated problem: the information + collected is not required for the correct functioning of the + server. */ + void reset() + UNIV_NOTHROW + { + m_mutex.enter(); + + Counters::iterator end = m_counters.end(); + + for (Counters::iterator it = m_counters.begin(); + it != end; + ++it) { + + (*it)->reset(); + } + + m_mutex.exit(); + } + + /** Get the single aggregate counter, creating and registering it + on first use. Asserts that exactly one counter is registered + otherwise. + @return the aggregate counter */ + Count* sum_register() + UNIV_NOTHROW + { + m_mutex.enter(); + + Count* count; + + if (m_counters.empty()) { + count = UT_NEW_NOKEY(Count()); + m_counters.push_back(count); + } else { + ut_a(m_counters.size() == 1); + count = m_counters[0]; + } + + m_mutex.exit(); + + return(count); + } + + /** Register a single instance counter + @param[in] count The count instance to register */ + void single_register(Count* count) + UNIV_NOTHROW + { + m_mutex.enter(); + + m_counters.push_back(count); + + m_mutex.exit(); + } + + /** Deregister a single instance counter. The instance is only + removed from m_counters; it is not deleted here. + @param[in] count The count instance to deregister */ + void single_deregister(Count* count) + UNIV_NOTHROW + { + m_mutex.enter(); + + m_counters.erase( + std::remove( + m_counters.begin(), + m_counters.end(), count), + m_counters.end()); + + m_mutex.exit(); + } + + /** Iterate over the counters while holding m_mutex, + invoking the callback on each registered counter */ + template<typename C> void iterate(const C& callback) UNIV_NOTHROW + { + m_mutex.enter(); + + Counters::const_iterator end = m_counters.end(); + + for 
(Counters::const_iterator it = m_counters.begin(); + it != end; + ++it) { + + callback(*it); + } + + m_mutex.exit(); + } + + /** Enable the monitoring: mark every registered counter as + enabled and set m_active, all while holding m_mutex */ + void enable() + UNIV_NOTHROW + { + m_mutex.enter(); + + Counters::const_iterator end = m_counters.end(); + + for (Counters::const_iterator it = m_counters.begin(); + it != end; + ++it) { + + (*it)->m_enabled = true; + } + + m_active = true; + + m_mutex.exit(); + } + + /** Disable the monitoring: mark every registered counter as + disabled and clear m_active, all while holding m_mutex */ + void disable() + UNIV_NOTHROW + { + m_mutex.enter(); + + Counters::const_iterator end = m_counters.end(); + + for (Counters::const_iterator it = m_counters.begin(); + it != end; + ++it) { + + (*it)->m_enabled = false; + } + + m_active = false; + + m_mutex.exit(); + } + + /** Read m_active without acquiring m_mutex. + @return whether monitoring is active */ + bool is_enabled() const + UNIV_NOTHROW + { + return(m_active); + } + +private: + /* Disable copying */ + LatchCounter(const LatchCounter&); + LatchCounter& operator=(const LatchCounter&); + +private: + typedef OSMutex Mutex; + typedef std::vector<Count*> Counters; + + /** Mutex protecting m_counters */ + Mutex m_mutex; + + /** Counters for the latches */ + Counters m_counters; + + /** if true then we collect the data */ + bool m_active; +}; + +/** Latch meta data */ +template <typename Counter = LatchCounter> +class LatchMeta { + +public: + typedef Counter CounterType; + +#ifdef UNIV_PFS_MUTEX + typedef mysql_pfs_key_t pfs_key_t; +#endif /* UNIV_PFS_MUTEX */ + + /** Default constructor: id LATCH_ID_NONE, level SYNC_UNKNOWN, + names value-initialized */ + LatchMeta() + : + m_id(LATCH_ID_NONE), + m_name(), + m_level(SYNC_UNKNOWN), + m_level_name() +#ifdef UNIV_PFS_MUTEX + ,m_pfs_key() +#endif /* UNIV_PFS_MUTEX */ + { + } + + /** Destructor */ + ~LatchMeta() { } + + /** Constructor + @param[in] id Latch id + @param[in] name Latch name + @param[in] level Latch level + @param[in] level_name Latch level text representation + @param[in] key PFS key (only present if UNIV_PFS_MUTEX is defined) */ + LatchMeta( + latch_id_t id, + const char* name, + latch_level_t level, + const char* level_name +#ifdef UNIV_PFS_MUTEX + ,pfs_key_t key 
+#endif /* UNIV_PFS_MUTEX */ + ) + : + m_id(id), + m_name(name), + m_level(level), + m_level_name(level_name) +#ifdef UNIV_PFS_MUTEX + ,m_pfs_key(key) +#endif /* UNIV_PFS_MUTEX */ + { + /* No op */ + } + + /* Less than operator. + @param[in] rhs Instance to compare against + @return true if this.get_id() < rhs.get_id() */ + bool operator<(const LatchMeta& rhs) const + { + return(get_id() < rhs.get_id()); + } + + /** @return the latch id */ + latch_id_t get_id() const + { + return(m_id); + } + + /** @return the latch name */ + const char* get_name() const + { + return(m_name); + } + + /** @return the latch level */ + latch_level_t get_level() const + { + return(m_level); + } + + /** @return the latch level name */ + const char* get_level_name() const + { + return(m_level_name); + } + +#ifdef UNIV_PFS_MUTEX + /** @return the PFS key for the latch */ + pfs_key_t get_pfs_key() const + { + return(m_pfs_key); + } +#endif /* UNIV_PFS_MUTEX */ + + /** @return the counter instance */ + Counter* get_counter() + { + return(&m_counter); + } + +private: + /** Latch id */ + latch_id_t m_id; + + /** Latch name */ + const char* m_name; + + /** Latch level in the ordering */ + latch_level_t m_level; + + /** Latch level text representation */ + const char* m_level_name; + +#ifdef UNIV_PFS_MUTEX + /** PFS key */ + pfs_key_t m_pfs_key; +#endif /* UNIV_PFS_MUTEX */ + + /** For gathering latch statistics */ + Counter m_counter; +}; + +typedef LatchMeta<LatchCounter> latch_meta_t; +typedef std::vector<latch_meta_t*, ut_allocator<latch_meta_t*> > LatchMetaData; + +/** Note: This is accessed without any mutex protection. It is initialised +at startup and elements should not be added to or removed from it after +that. 
See sync_latch_meta_init() */ +extern LatchMetaData latch_meta; + +/** Get the latch meta-data from the latch ID. +Debug assertions (ut_ad) check that the id is within latch_meta and that +the stored entry carries the same id. +@param[in] id Latch ID +@return the latch meta data */ +inline +latch_meta_t& +sync_latch_get_meta(latch_id_t id) +{ + ut_ad(static_cast<size_t>(id) < latch_meta.size()); + ut_ad(id == latch_meta[id]->get_id()); + + return(*latch_meta[id]); +} + +/** Fetch the counter for the latch +@param[in] id Latch ID +@return the latch counter */ +inline +latch_meta_t::CounterType* +sync_latch_get_counter(latch_id_t id) +{ + latch_meta_t& meta = sync_latch_get_meta(id); + + return(meta.get_counter()); +} + +/** Get the latch name from the latch ID +@param[in] id Latch ID +@return the name, will assert if not found */ +inline +const char* +sync_latch_get_name(latch_id_t id) +{ + const latch_meta_t& meta = sync_latch_get_meta(id); + + return(meta.get_name()); +} + +/** Get the latch ordering level +@param[in] id Latch id to lookup +@return the latch level */ +inline +latch_level_t +sync_latch_get_level(latch_id_t id) +{ + const latch_meta_t& meta = sync_latch_get_meta(id); + + return(meta.get_level()); +} + +#ifdef UNIV_PFS_MUTEX +/** Get the latch PFS key from the latch ID +@param[in] id Latch ID +@return the PFS key */ +inline +mysql_pfs_key_t +sync_latch_get_pfs_key(latch_id_t id) +{ + const latch_meta_t& meta = sync_latch_get_meta(id); + + return(meta.get_pfs_key()); +} +#endif + +/** String representation of the filename and line number where the +latch was created +@param[in] id Latch ID +@param[in] created Filename and line number where it was created +@return the string representation */ +std::string +sync_mutex_to_string( + latch_id_t id, + const std::string& created); + +/** Get the latch name from a sync level +@param[in] level Latch level to lookup +@return 0 if not found. 
*/ +const char* +sync_latch_get_name(latch_level_t level); + +/** Print the filename "basename" +@return the basename */ +const char* +sync_basename(const char* filename); + +#ifdef UNIV_DEBUG + +/** All (ordered) latches, used in debugging, must derive from this class. */ +struct latch_t { + + /** Constructor + @param[in] id The latch ID */ + explicit latch_t(latch_id_t id = LATCH_ID_NONE) + UNIV_NOTHROW + : + m_id(id), + m_rw_lock() {} + + /** Destructor */ + virtual ~latch_t() UNIV_NOTHROW { } + + /** @return the latch ID */ + latch_id_t get_id() const + { + return(m_id); + } + + /** @return true if it is a rw-lock */ + bool is_rw_lock() const + UNIV_NOTHROW + { + return(m_rw_lock); + } + + /** Print the latch context + @return the string representation */ + virtual std::string to_string() const = 0; + + /** @return the latch level */ + latch_level_t get_level() const + UNIV_NOTHROW + { + ut_a(m_id != LATCH_ID_NONE); + + return(sync_latch_get_level(m_id)); + } + + /** @return the latch name, m_id must be set */ + const char* get_name() const + UNIV_NOTHROW + { + ut_a(m_id != LATCH_ID_NONE); + + return(sync_latch_get_name(m_id)); + } + + /** Latch ID */ + latch_id_t m_id; + + /** true if it is a rw-lock. In debug mode, rw_lock_t derives from + this class and sets this variable. */ + bool m_rw_lock; +}; + +/** Subclass this to iterate over a thread's acquired latch levels. */ +struct sync_check_functor_t { + virtual ~sync_check_functor_t() { } + virtual bool operator()(const latch_level_t) const = 0; +}; + +/** Check that no latch is being held. 
+@tparam some_allowed whether some latches are allowed to be held */ +template<bool some_allowed = false> +struct sync_checker : public sync_check_functor_t +{ + /** Check the latching constraints. + When some_allowed is set, the levels SYNC_FSP, SYNC_DICT, + SYNC_DICT_OPERATION, SYNC_FTS_CACHE and SYNC_NO_ORDER_CHECK are + tolerated; any other level is a violation. When some_allowed is + not set, every level is a violation. + @param[in] level The level held by the thread + @return whether a latch violation was detected */ + bool operator()(const latch_level_t level) const override + { + if (some_allowed) { + switch (level) { + case SYNC_FSP: + case SYNC_DICT: + case SYNC_DICT_OPERATION: + case SYNC_FTS_CACHE: + case SYNC_NO_ORDER_CHECK: + return(false); + default: + return(true); + } + } + + return(true); + } +}; + +/** The strict latch checker (no InnoDB latches may be held) */ +typedef struct sync_checker<false> sync_check; +/** The sloppy latch checker (can hold InnoDB dictionary or SQL latches) */ +typedef struct sync_checker<true> dict_sync_check; + +/** Functor to check for given latching constraints. */ +struct sync_allowed_latches : public sync_check_functor_t { + + /** Constructor + @param[in] from first element in an array of latch_level_t + @param[in] to one past the last element in the array of + latch_level_t (used as the end iterator) */ + sync_allowed_latches( + const latch_level_t* from, + const latch_level_t* to) + : begin(from), end(to) { } + + /** Checks whether the given latch level violates the latch constraint. + This object maintains a list of allowed latch levels, and if the given + latch level is not there in the allowed list [begin, end), + then it is a violation. + + @param[in] level The latch level to check + @return true if there is a latch violation */ + bool operator()(const latch_level_t level) const override + { + return(std::find(begin, end, level) == end); + } + +private: + /** First element in an array of allowed latch levels */ + const latch_level_t* const begin; + /** First element after the end of the array of allowed latch levels */ + const latch_level_t* const end; +}; + +/** Get the latch id from a latch name. +@param[in] id Latch name +@return LATCH_ID_NONE. 
*/ +latch_id_t +sync_latch_get_id(const char* name); + +typedef ulint rw_lock_flags_t; + +/* Flags to specify lock types for rw_lock_own_flagged(); each flag is a +distinct bit so that they can be combined in a rw_lock_flags_t */ +enum rw_lock_flag_t { + RW_LOCK_FLAG_S = 1 << 0, + RW_LOCK_FLAG_X = 1 << 1, + RW_LOCK_FLAG_SX = 1 << 2 +}; + +#endif /* UNIV_DEBUG */ + +#endif /* UNIV_INNOCHECKSUM */ + +/** Simple non-atomic counter aligned to CPU_LEVEL1_DCACHE_LINESIZE +@tparam Type the integer type of the counter */ +template <typename Type> +struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter +{ + /** Increment the counter + @return the value of the counter after incrementing */ + Type inc() { return add(1); } + /** Decrement the counter by adding Type(~0) (all bits set) + @return the value of the counter after decrementing */ + Type dec() { return add(Type(~0)); } + + /** Add to the counter + @param[in] i amount to be added + @return the value of the counter after adding */ + Type add(Type i) { return m_counter += i; } + + /** @return the value of the counter */ + operator Type() const { return m_counter; } + +private: + /** The counter */ + Type m_counter; +}; +#endif /* sync0types_h */ diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h new file mode 100644 index 00000000..40160ce4 --- /dev/null +++ b/storage/innobase/include/trx0i_s.h @@ -0,0 +1,278 @@ +/***************************************************************************** + +Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0i_s.h +INFORMATION SCHEMA innodb_trx, innodb_locks and +innodb_lock_waits tables cache structures and public +functions. + +Created July 17, 2007 Vasil Dimov +*******************************************************/ + +#ifndef trx0i_s_h +#define trx0i_s_h + +#include "trx0types.h" +#include "dict0types.h" +#include "buf0types.h" + +/** The maximum amount of memory that can be consumed by innodb_trx, +innodb_locks and innodb_lock_waits information schema tables. */ +#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */ + +/** The maximum length of a string that can be stored in +i_s_locks_row_t::lock_data */ +#define TRX_I_S_LOCK_DATA_MAX_LEN 8192 + +/** The maximum length of a string that can be stored in +i_s_trx_row_t::trx_query */ +#define TRX_I_S_TRX_QUERY_MAX_LEN 1024 + +/** The maximum length of a string that can be stored in +i_s_trx_row_t::trx_foreign_key_error */ +#define TRX_I_S_TRX_FK_ERROR_MAX_LEN 256 + +/** Safely copy a string into one of the INNODB_TRX table's string-based columns (field is assigned the stored copy). +A string longer than constraint is cut to constraint bytes plus a terminating NUL before being stored; data and constraint are expanded more than once and are not parenthesized. */ +#define TRX_I_S_STRING_COPY(data, field, constraint, tcache) \ +do { \ + if (strlen(data) > constraint) { \ + char buff[constraint + 1]; \ + strncpy(buff, data, constraint); \ + buff[constraint] = '\0'; \ + \ + field = static_cast<const char*>( \ + ha_storage_put_memlim( \ + (tcache)->storage, buff, constraint + 1,\ + MAX_ALLOWED_FOR_STORAGE(tcache))); \ + } else { \ + field = static_cast<const char*>( \ + ha_storage_put_str_memlim( \ + (tcache)->storage, data, \ + MAX_ALLOWED_FOR_STORAGE(tcache))); \ + } \ +} while (0) + +/** A row of INFORMATION_SCHEMA.innodb_locks */ +struct i_s_locks_row_t; + +/** Objects of
trx_i_s_cache_t::locks_hash */ +struct i_s_hash_chain_t; + +/** Objects of this type are added to the hash table +trx_i_s_cache_t::locks_hash */ +struct i_s_hash_chain_t { + i_s_locks_row_t* value; /*!< row of + INFORMATION_SCHEMA.innodb_locks*/ + i_s_hash_chain_t* next; /*!< next item in the hash chain */ +}; + +/** This structure represents INFORMATION_SCHEMA.innodb_locks row */ +struct i_s_locks_row_t { + trx_id_t lock_trx_id; /*!< transaction identifier */ + const char* lock_table; /*!< table name from + lock_get_table_name() */ + /** index name of a record lock; NULL for table locks */ + const char* lock_index; + /** page identifier of the record; (0,0) if !lock_index */ + page_id_t lock_page; + /** heap number of the record; 0 if !lock_index */ + uint16_t lock_rec; + /** lock mode corresponding to lock_mode_values_typelib */ + uint8_t lock_mode; + /** (some) content of the record, if available in the buffer pool; + NULL if !lock_index */ + const char* lock_data; + + /** The following are auxiliary and not included in the table */ + /* @{ */ + table_id_t lock_table_id; + /*!< table identifier from + lock_get_table_id */ + i_s_hash_chain_t hash_chain; /*!< hash table chain node for + trx_i_s_cache_t::locks_hash */ + /* @} */ +}; + +/** This structure represents INFORMATION_SCHEMA.innodb_trx row */ +struct i_s_trx_row_t { + trx_id_t trx_id; /*!< transaction identifier */ + const char* trx_state; /*!< transaction state from + trx_get_que_state_str() */ + time_t trx_started; /*!< trx_t::start_time */ + const i_s_locks_row_t* requested_lock_row; + /*!< pointer to a row + in innodb_locks if trx + is waiting, or NULL */ + time_t trx_wait_started; /*!< trx_t->lock.wait_started */ + uintmax_t trx_weight; /*!< TRX_WEIGHT() */ + ulint trx_mysql_thread_id; /*!< thd_get_thread_id() */ + const char* trx_query; /*!< MySQL statement being + executed in the transaction */ + CHARSET_INFO* trx_query_cs; /*!< the charset of trx_query */ + const char* trx_operation_state; /*!< 
trx_t::op_info */ + ulint trx_tables_in_use;/*!< n_mysql_tables_in_use in + trx_t */ + ulint trx_tables_locked; + /*!< mysql_n_tables_locked in + trx_t */ + ulint trx_lock_structs;/*!< list len of trx_locks in + trx_t */ + ulint trx_lock_memory_bytes; + /*!< mem_heap_get_size( + trx->lock_heap) */ + ulint trx_rows_locked;/*!< lock_number_of_rows_locked() */ + uintmax_t trx_rows_modified;/*!< trx_t::undo_no */ + uint trx_isolation_level; + /*!< trx_t::isolation_level */ + bool trx_unique_checks; + /*!< check_unique_secondary in trx_t */ + bool trx_foreign_key_checks; + /*!< check_foreigns in trx_t */ + const char* trx_foreign_key_error; + /*!< detailed_error in trx_t */ + bool trx_is_read_only; + /*!< trx_t::read_only */ + bool trx_is_autocommit_non_locking; + /*!< trx_t::is_autocommit_non_locking() + */ +}; + +/** This structure represents an INFORMATION_SCHEMA.innodb_lock_waits row */ +struct i_s_lock_waits_row_t { + const i_s_locks_row_t* requested_lock_row; /*!< requested lock */ + const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */ +}; + +/** Cache of INFORMATION_SCHEMA table data */ +struct trx_i_s_cache_t; + +/** Auxiliary enum used by functions that need to select one of the +INFORMATION_SCHEMA tables */ +enum i_s_table { + I_S_INNODB_TRX, /*!< INFORMATION_SCHEMA.innodb_trx */ + I_S_INNODB_LOCKS, /*!< INFORMATION_SCHEMA.innodb_locks */ + I_S_INNODB_LOCK_WAITS /*!< INFORMATION_SCHEMA.innodb_lock_waits */ +}; + +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +extern trx_i_s_cache_t* trx_i_s_cache; + +/*******************************************************************//** +Initialize INFORMATION SCHEMA trx related cache.
*/ +void +trx_i_s_cache_init( +/*===============*/ + trx_i_s_cache_t* cache); /*!< out: cache to init */ +/*******************************************************************//** +Free the INFORMATION SCHEMA trx related cache. */ +void +trx_i_s_cache_free( +/*===============*/ + trx_i_s_cache_t* cache); /*!< in/out: cache to free */ + +/*******************************************************************//** +Issue a shared/read lock on the tables cache. */ +void +trx_i_s_cache_start_read( +/*=====================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + +/*******************************************************************//** +Release a shared/read lock on the tables cache. */ +void +trx_i_s_cache_end_read( +/*===================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + +/*******************************************************************//** +Issue an exclusive/write lock on the tables cache. */ +void +trx_i_s_cache_start_write( +/*======================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + +/*******************************************************************//** +Release an exclusive/write lock on the tables cache. */ +void +trx_i_s_cache_end_write( +/*====================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + + +/*******************************************************************//** +Retrieves the number of used rows in the cache for a given +INFORMATION SCHEMA table. +@return number of rows */ +ulint +trx_i_s_cache_get_rows_used( +/*========================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table); /*!< in: which table */ + +/*******************************************************************//** +Retrieves the nth row in the cache for a given INFORMATION SCHEMA +table. 
+@return row */ +void* +trx_i_s_cache_get_nth_row( +/*======================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table, /*!< in: which table */ + ulint n); /*!< in: row number */ + +/*******************************************************************//** +Update the transactions cache if it has not been read for some time. +@return 0 - fetched, 1 - not */ +int +trx_i_s_possibly_fetch_data_into_cache( +/*===================================*/ + trx_i_s_cache_t* cache); /*!< in/out: cache */ + +/*******************************************************************//** +Returns true if the data in the cache is truncated due to the memory +limit posed by TRX_I_S_MEM_LIMIT. +@return true if truncated */ +bool +trx_i_s_cache_is_truncated( +/*=======================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ +/** The maximum length of a resulting lock_id in +trx_i_s_create_lock_id(), not including the terminating NUL. +":%lu:%lu:%lu" -> 63 chars */ +#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63) + +/*******************************************************************//** +Crafts a lock id string from a i_s_locks_row_t object. Returns its +second argument. This function aborts if there is not enough space in +lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you +want to be 100% sure that it will not abort.
+@return resulting lock id */ +char* +trx_i_s_create_lock_id( +/*===================*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + char* lock_id,/*!< out: resulting lock_id */ + ulint lock_id_size);/*!< in: size of the lock id + buffer */ + +#endif /* trx0i_s_h */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h new file mode 100644 index 00000000..ef942076 --- /dev/null +++ b/storage/innobase/include/trx0purge.h @@ -0,0 +1,268 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0purge.h +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0purge_h +#define trx0purge_h + +#include "trx0rseg.h" +#include "que0types.h" + +#include <queue> + +/** A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +extern trx_undo_rec_t trx_purge_dummy_rec; + +/** Prepend the history list with an undo log. 
+Remove the undo log segment from the rseg slot if it is too big for reuse. +@param[in] trx transaction +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction */ +void +trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr); +/** +Run a purge batch. +@param n_tasks number of purge tasks to submit to the queue +@param truncate whether to truncate the history at the end of the batch +@return number of undo log pages handled in the batch */ +ulint trx_purge(ulint n_tasks, bool truncate); + +/** Rollback segments from a given transaction with trx-no +scheduled for purge. */ +class TrxUndoRsegs { +private: + typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> > + trx_rsegs_t; +public: + typedef trx_rsegs_t::iterator iterator; + typedef trx_rsegs_t::const_iterator const_iterator; + + TrxUndoRsegs() {} + + /** Constructor: trx_no is taken from rseg.last_trx_no(); rseg becomes the only element */ + TrxUndoRsegs(trx_rseg_t& rseg) + : trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {} + /** Constructor: explicit trx_no; rseg becomes the only element */ + TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg) + : trx_no(trx_no), m_rsegs(1, &rseg) {} + + bool operator!=(const TrxUndoRsegs& other) const + { return trx_no != other.trx_no; } + bool empty() const { return m_rsegs.empty(); } + void erase(iterator& it) { m_rsegs.erase(it); } + iterator begin() { return(m_rsegs.begin()); } + iterator end() { return(m_rsegs.end()); } + const_iterator begin() const { return m_rsegs.begin(); } + const_iterator end() const { return m_rsegs.end(); } + + /** Compare two TrxUndoRsegs based on trx_no. + @param lhs first element to compare + @param rhs second element to compare + @return true if lhs.trx_no > rhs.trx_no else false.*/ + bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs) + { + return(lhs.trx_no > rhs.trx_no); + } + + /** Copy of trx_rseg_t::last_trx_no() */ + trx_id_t trx_no= 0; +private: + /** Rollback segments of a transaction, scheduled for purge.
*/ + trx_rsegs_t m_rsegs{}; +}; + +typedef std::priority_queue< + TrxUndoRsegs, + std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >, + TrxUndoRsegs> purge_pq_t; + +/** Chooses the rollback segment with the oldest committed transaction */ +struct TrxUndoRsegsIterator { + /** Constructor */ + TrxUndoRsegsIterator(); + /** Sets the next rseg to purge in purge_sys. + Executed in the purge coordinator thread. + @return whether anything is to be purged */ + inline bool set_next(); + +private: + // Disable copying + TrxUndoRsegsIterator(const TrxUndoRsegsIterator&); + TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&); + + /** The current element to process */ + TrxUndoRsegs m_rsegs; + /** Track the current element in m_rsegs */ + TrxUndoRsegs::const_iterator m_iter; +}; + +/** The control structure used in the purge operation */ +class purge_sys_t +{ +public: + /** latch protecting view, m_enabled */ + MY_ALIGNED(CACHE_LINE_SIZE) + mutable rw_lock_t latch; +private: + /** The purge will not remove undo logs which are >= this view */ + MY_ALIGNED(CACHE_LINE_SIZE) + ReadViewBase view; + /** whether purge is enabled; protected by latch and std::atomic */ + std::atomic<bool> m_enabled; + /** number of pending stop() calls without resume() */ + Atomic_counter<int32_t> m_paused; +public: + que_t* query; /*!< The query graph which will do the + parallelized purge operation */ + + /** Iterator to the undo log records of committed transactions; operator<= orders by trx_no first, then by undo_no */ + struct iterator + { + bool operator<=(const iterator& other) const + { + if (trx_no < other.trx_no) return true; + if (trx_no > other.trx_no) return false; + return undo_no <= other.undo_no; + } + + /** trx_t::no of the committed transaction */ + trx_id_t trx_no; + /** The record number within the committed transaction's undo + log, increasing, purged from 0 onwards */ + undo_no_t undo_no; + }; + + /** The tail of the purge queue; the last parsed undo log of a + committed transaction.
*/ + iterator tail; + /** The head of the purge queue; any older undo logs of committed + transactions may be discarded (history list truncation). */ + iterator head; + /*-----------------------------*/ + bool next_stored; /*!< whether rseg holds the next record + to purge */ + trx_rseg_t* rseg; /*!< Rollback segment for the next undo + record to purge */ + uint32_t page_no; /*!< Page number for the next undo + record to purge, page number of the + log header, if dummy record */ + uint32_t hdr_page_no; /*!< Header page of the undo log where + the next record to purge belongs */ + uint16_t offset; /*!< Page offset for the next undo + record to purge, 0 if the dummy + record */ + uint16_t hdr_offset; /*!< Header byte offset on the page */ + + + TrxUndoRsegsIterator + rseg_iter; /*!< Iterator to get the next rseg + to process */ + + purge_pq_t purge_queue; /*!< Binary min-heap, ordered on + TrxUndoRsegs::trx_no. It is protected + by the pq_mutex */ + PQMutex pq_mutex; /*!< Mutex protecting purge_queue */ + + /** Undo tablespace file truncation (only accessed by the + srv_purge_coordinator_thread) */ + struct { + /** The undo tablespace that is currently being truncated */ + fil_space_t* current; + /** The undo tablespace that was last truncated */ + fil_space_t* last; + } truncate; + + /** Heap for reading the undo log records */ + mem_heap_t* heap; + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + + purge_sys_t(): m_enabled(false), heap(nullptr) {} + + /** Create the instance */ + void create(); + + /** Close the purge system on shutdown */ + void close(); + + /** @return whether purge is enabled */ + bool enabled() { return m_enabled.load(std::memory_order_relaxed); } + /** @return whether the purge coordinator is paused */ + bool paused() + { return m_paused != 0; } + + /** Enable purge at startup. 
Not protected by latch; the main thread + will wait for purge_sys.enabled() in srv_start() */ + void coordinator_startup() + { + ut_ad(!enabled()); + m_enabled.store(true, std::memory_order_relaxed); + } + + /** Disable purge at shutdown */ + void coordinator_shutdown() + { + ut_ad(enabled()); + m_enabled.store(false, std::memory_order_relaxed); + } + + /** @return whether the purge tasks are active */ + bool running() const; + /** Stop purge during FLUSH TABLES FOR EXPORT */ + void stop(); + /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */ + void resume(); + /** A wrapper around ReadView::changes_visible(). */ + bool changes_visible(trx_id_t id, const table_name_t &name) const + { + ut_ad(rw_lock_own(&latch, RW_LOCK_S)); + return view.changes_visible(id, name); + } + /** A wrapper around ReadView::low_limit_no(). */ + trx_id_t low_limit_no() const + { +#if 0 /* Unfortunately we don't hold this assertion, see MDEV-22718. */ + ut_ad(rw_lock_own(&latch, RW_LOCK_S)); +#endif + return view.low_limit_no(); + } + /** A wrapper around trx_sys_t::clone_oldest_view(). */ + void clone_oldest_view() + { + rw_lock_x_lock(&latch); + trx_sys.clone_oldest_view(&view); + rw_lock_x_unlock(&latch); + } +}; + +/** The global data structure coordinating a purge */ +extern purge_sys_t purge_sys; + +#endif /* trx0purge_h */ diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h new file mode 100644 index 00000000..9aeff631 --- /dev/null +++ b/storage/innobase/include/trx0rec.h @@ -0,0 +1,321 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rec.h +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0rec_h +#define trx0rec_h + +#include "trx0types.h" +#include "row0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "page0types.h" +#include "row0log.h" +#include "que0types.h" + +/***********************************************************************//** +Copies the undo record to the heap. +@return own: copy of undo log record */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_rec_copy( +/*==============*/ + const trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + mem_heap_t* heap); /*!< in: heap where copied */ +/**********************************************************************//** +Reads the undo log record type. +@return record type */ +UNIV_INLINE +ulint +trx_undo_rec_get_type( +/*==================*/ + const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ +/**********************************************************************//** +Reads the undo log record number. +@return undo no */ +UNIV_INLINE +undo_no_t +trx_undo_rec_get_undo_no( +/*=====================*/ + const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ + +/**********************************************************************//** +Returns the start of the undo record data area. 
*/ +#define trx_undo_rec_get_ptr(undo_rec, undo_no) \ + ((undo_rec) + trx_undo_rec_get_offset(undo_no)) + +/**********************************************************************//** +Reads from an undo log record the general parameters. +@return remaining part of undo log record after reading these values */ +byte* +trx_undo_rec_get_pars( +/*==================*/ + trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + ulint* type, /*!< out: undo record type: + TRX_UNDO_INSERT_REC, ... */ + ulint* cmpl_info, /*!< out: compiler info, relevant only + for update type records */ + bool* updated_extern, /*!< out: true if we updated an + externally stored field */ + undo_no_t* undo_no, /*!< out: undo log record number */ + table_id_t* table_id) /*!< out: table id */ + MY_ATTRIBUTE((nonnull)); +/*******************************************************************//** +Builds a row reference from an undo log record. +@return pointer to remaining part of undo record */ +byte* +trx_undo_rec_get_row_ref( +/*=====================*/ + byte* ptr, /*!< in: remaining part of a copy of an undo log + record, at the start of the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the row reference is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t**ref, /*!< out, own: row reference */ + mem_heap_t* heap); /*!< in: memory heap from which the memory + needed is allocated */ +/**********************************************************************//** +Reads from an undo log update record the system field values of the old +version.
+@return remaining part of undo log record after reading these values */ +byte* +trx_undo_update_rec_get_sys_cols( +/*=============================*/ + const byte* ptr, /*!< in: remaining part of undo + log record after reading + general parameters */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr, /*!< out: roll ptr */ + byte* info_bits); /*!< out: info bits state */ +/*******************************************************************//** +Builds an update vector based on a remaining part of an undo log record. +@return remaining part of the record, NULL if an error is detected, which +means that the record is corrupted */ +byte* +trx_undo_update_rec_get_update( +/*===========================*/ + const byte* ptr, /*!< in: remaining part in update undo log + record, after reading the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the update vector is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC, + TRX_UNDO_UPD_DEL_REC, or + TRX_UNDO_DEL_MARK_REC; in the last case, + only trx id and roll ptr fields are added to + the update vector */ + trx_id_t trx_id, /*!< in: transaction id from this undo record */ + roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */ + byte info_bits,/*!< in: info bits from this undo record */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + upd_t** upd); /*!< out, own: update vector */ +/*******************************************************************//** +Builds a partial row from an update undo log record, for purge. +It contains the columns which occur as ordering in any index of the table. +Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */ +byte* +trx_undo_rec_get_partial_row( +/*=========================*/ + const byte* ptr, /*!< in: remaining part in update undo log + record of a suitable type, at the start of + the stored index columns; + NOTE that this copy of the undo log record must + be preserved as long as the partial row is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: updated columns */ + dtuple_t** row, /*!< out, own: partial row */ + ibool ignore_prefix, /*!< in: flag to indicate if we + expect blob prefixes in undo. Used + only in the assertion. */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Report a RENAME TABLE operation. +@param[in,out] trx transaction +@param[in] table table that is being renamed +@return DB_SUCCESS or error code */ +dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Writes information to an undo log about an insert, update, or a delete marking +of a clustered index record. This information is used in a rollback of the +transaction and in consistent reads that must look to the history of this +transaction. 
+@return DB_SUCCESS or error code */ +dberr_t +trx_undo_report_row_operation( +/*==========================*/ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* clust_entry, /*!< in: in the case of an insert, + index entry to insert into the + clustered index; in updates, + may contain a clustered index + record tuple that also contains + virtual columns of the table; + otherwise, NULL */ + const upd_t* update, /*!< in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + const rec_t* rec, /*!< in: in the case of an update or delete + marking, the record in the clustered + index; NULL if insert */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */ + roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the + undo log record */ + MY_ATTRIBUTE((nonnull(1,2,8), warn_unused_result)); + +/** Status bits used for trx_undo_prev_version_build() */ + +/** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it +is being called by purge and that we would like to get the purge record +even if it is in the purge view (in the normal case, it will return without +fetching the purge record) */ +#define TRX_UNDO_PREV_IN_PURGE 0x1 + +/** This tells trx_undo_prev_version_build() to fetch the old value in +the undo log (which is the after image for an update) */ +#define TRX_UNDO_GET_OLD_V_VALUE 0x2 + +/*******************************************************************//** +Build a previous version of a clustered index record. The caller must +hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert +or the table has been rebuilt +@retval false if the previous version is earlier than purge_view, +which means that it may have been removed */ +bool +trx_undo_prev_version_build( +/*========================*/ + const rec_t* index_rec,/*!< in: clustered index record in the + index tree */ + mtr_t* index_mtr,/*!< in: mtr which contains the latch to + index_rec page and purge_view */ + const rec_t* rec, /*!< in: version of a clustered index record */ + dict_index_t* index, /*!< in: clustered index */ + rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers,/*!< out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted */ + mem_heap_t* v_heap, /*!< in: memory heap used to create vrow + dtuple if it is not yet created. This heap + differs from "heap" above in that it could be + prebuilt->old_vers_heap for selection */ + dtuple_t** vrow, /*!< out: virtual column info, if any */ + ulint v_status); + /*!< in: status bits (TRX_UNDO_PREV_IN_PURGE, + TRX_UNDO_GET_OLD_V_VALUE): whether the caller is the purge thread, + and whether we read the "after image" of the undo log */ + +/** Read from an undo log record a non-virtual column value.
+@param[in,out] ptr pointer to remaining part of the undo record +@param[in,out] field stored field +@param[in,out] len length of the field, or UNIV_SQL_NULL +@param[in,out] orig_len original length of the locally stored part +of an externally stored column, or 0 +@return remaining part of undo log record after reading these values */ +byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field, + uint32_t *len, uint32_t *orig_len); + +/** Read virtual column value from undo log +@param[in] table the table +@param[in] ptr undo log pointer +@param[in,out] row the dtuple to fill +@param[in] in_purge whether this is called by purge */ +void +trx_undo_read_v_cols( + const dict_table_t* table, + const byte* ptr, + dtuple_t* row, + bool in_purge); + +/** Read virtual column index from undo log if the undo log contains such +info, and verify the column is still indexed, and output its position +@param[in] table the table +@param[in] ptr undo log pointer +@param[in] first_v_col if this is the first virtual column, which + has the version marker +@param[in,out] is_undo_log This function is used to parse both undo log, + and online log for virtual columns.
So check to see if this is an undo log +@param[out] field_no the column number, or FIL_NULL if not indexed +@return remaining part of undo log record after reading these values */ +const byte* +trx_undo_read_v_idx( + const dict_table_t* table, + const byte* ptr, + bool first_v_col, + bool* is_undo_log, + uint32_t* field_no); + +/* Types of an undo log record: these have to be smaller than 16, as the +compilation info multiplied by 16 is ORed to this value in an undo log +record */ + +#define TRX_UNDO_RENAME_TABLE 9 /*!< RENAME TABLE */ +#define TRX_UNDO_INSERT_METADATA 10 /*!< insert a metadata + pseudo-record for instant ALTER */ +#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */ +#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked + record */ +#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to + a not delete marked record; also the + fields of the record can change */ +#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields + do not change */ +#define TRX_UNDO_CMPL_INFO_MULT 16U /* compilation info is multiplied by + this and ORed to the type above */ +#define TRX_UNDO_UPD_EXTERN 128U /* This bit can be ORed to type_cmpl + to denote that we updated external + storage fields: used by purge to + free the external storage */ + +/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */ +extern const dtuple_t trx_undo_metadata; + +/** Read the table id from an undo log record.
+@param[in] rec Undo log record +@return table id stored as a part of undo log record */ +inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec) +{ + rec+= 3; + mach_read_next_much_compressed(&rec); + return mach_read_next_much_compressed(&rec); +} + +#include "trx0rec.ic" + +#endif /* trx0rec_h */ diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic new file mode 100644 index 00000000..02244d68 --- /dev/null +++ b/storage/innobase/include/trx0rec.ic @@ -0,0 +1,73 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rec.ic +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Reads from an undo log record the record type. 
+@return record type */ +UNIV_INLINE +ulint +trx_undo_rec_get_type( +/*==================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ /* byte 2 of the record carries the type in the bits below TRX_UNDO_CMPL_INFO_MULT; the mask drops everything above them */ + return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1)); +} + +/**********************************************************************//** +Reads the undo log record number. +@return undo no */ +UNIV_INLINE +undo_no_t +trx_undo_rec_get_undo_no( +/*=====================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + const byte* ptr; + + ptr = undo_rec + 3; /* the undo number starts right after the 3 leading bytes and is stored in much-compressed form */ + + return(mach_u64_read_much_compressed(ptr)); +} + +/***********************************************************************//** +Copies the undo record to the heap. The first two bytes of the copy are +overwritten with the length of the copy. +@return own: copy of undo log record */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_rec_copy( +/*==============*/ + const trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + mem_heap_t* heap) /*!< in: heap where copied */ +{ + ulint len; /* number of bytes to duplicate */ + + len = mach_read_from_2(undo_rec) + - ut_align_offset(undo_rec, srv_page_size); /* 2-byte value stored at the record start minus the record's own offset within its page; presumably the stored value is the page offset of the record end -- TODO confirm */ + ut_ad(len < srv_page_size); /* NOTE(review): ut_ad is a debug assertion, so presumably nothing bounds len in non-debug builds; a stored value below the record's own offset would wrap the unsigned len -- confirm that callers only pass validated records */ + trx_undo_rec_t* rec = static_cast<trx_undo_rec_t*>( + mem_heap_dup(heap, undo_rec, len)); + mach_write_to_2(rec, len); /* replace the first two bytes of the copy with len */ + return rec; +} diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h new file mode 100644 index 00000000..6a562dcb --- /dev/null +++ b/storage/innobase/include/trx0roll.h @@ -0,0 +1,187 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0roll.h +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0roll_h +#define trx0roll_h + +#include "trx0trx.h" +#include "mtr0mtr.h" +#include "trx0sys.h" + +extern bool trx_rollback_is_active; +extern const trx_t* trx_roll_crash_recv_trx; + +/*******************************************************************//** +Returns a transaction savepoint taken at this point in time. +@return savepoint */ +trx_savept_t +trx_savept_take( +/*============*/ + trx_t* trx); /*!< in: transaction */ + +/** Report progress when rolling back a row of a recovered transaction. */ +void trx_roll_report_progress(); +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +@param all true=roll back all recovered active transactions; +false=roll back any incomplete dictionary transaction */ +void +trx_rollback_recovered(bool all); +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. 
If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. +@return a dummy parameter */ +extern "C" +os_thread_ret_t +DECLARE_THREAD(trx_rollback_all_recovered)(void*); +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +dberr_t +trx_rollback_for_mysql( +/*===================*/ + trx_t* trx) /*!< in/out: transaction */ + MY_ATTRIBUTE((nonnull)); +/*******************************************************************//** +Rollback the latest SQL statement for MySQL. +@return error code or DB_SUCCESS */ +dberr_t +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + trx_t* trx) /*!< in/out: transaction */ + MY_ATTRIBUTE((nonnull)); +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. 
+@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +dberr_t +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*******************************************************************//** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. +@return always DB_SUCCESS */ +dberr_t +trx_savepoint_for_mysql( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + int64_t binlog_cache_pos) /*!< in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ + MY_ATTRIBUTE((nonnull)); +/*******************************************************************//** +Releases a named savepoint. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +dberr_t +trx_release_savepoint_for_mysql( +/*============================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name) /*!< in: savepoint name */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*******************************************************************//** +Frees savepoint structs starting from savep. 
*/ +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep); /*!< in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ +/** Rollback node states */ +enum roll_node_state { + ROLL_NODE_NONE = 0, /*!< Unknown state */ + ROLL_NODE_SEND, /*!< about to send a rollback signal to + the transaction */ + ROLL_NODE_WAIT /*!< rollback signal sent to the + transaction, waiting for completion */ +}; + +/** Rollback command node in a query graph */ +struct roll_node_t{ + que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */ + enum roll_node_state state; /*!< node execution state */ + const trx_savept_t* savept; /*!< savepoint to which to + roll back, in the case of a + partial rollback */ + que_thr_t* undo_thr;/*!< undo query graph */ +}; + +/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */ +struct trx_named_savept_t{ + char* name; /*!< savepoint name */ + trx_savept_t savept; /*!< the undo number corresponding to + the savepoint */ + int64_t mysql_binlog_cache_pos; + /*!< the MySQL binlog cache position + corresponding to this savepoint, not + defined if the MySQL binlogging is not + enabled */ + UT_LIST_NODE_T(trx_named_savept_t) + trx_savepoints; /*!< the list of savepoints of a + transaction */ +}; + +#endif diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h new file mode 100644 index 00000000..7e4511b8 --- /dev/null +++ b/storage/innobase/include/trx0rseg.h @@ -0,0 +1,277 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rseg.h +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0rseg_h +#define trx0rseg_h + +#include "trx0sys.h" +#include "fut0lst.h" + +/** Gets a rollback segment header. +@param[in] space space where placed +@param[in] page_no page number of the header +@param[in,out] mtr mini-transaction +@return rollback segment header, page x-latched */ +UNIV_INLINE +buf_block_t* +trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr); + +/** Gets a newly created rollback segment header. +@param[in] space space where placed +@param[in] page_no page number of the header +@param[in,out] mtr mini-transaction +@return rollback segment header, page x-latched */ +UNIV_INLINE +buf_block_t* +trx_rsegf_get_new( + ulint space, + uint32_t page_no, + mtr_t* mtr); + +/** Create a rollback segment header. +@param[in,out] space system, undo, or temporary tablespace +@param[in] rseg_id rollback segment identifier +@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg) +@param[in,out] mtr mini-transaction +@return the created rollback segment +@retval NULL on failure */ +buf_block_t* +trx_rseg_header_create( + fil_space_t* space, + ulint rseg_id, + buf_block_t* sys_header, + mtr_t* mtr); + +/** Initialize or recover the rollback segments at startup. 
*/ +dberr_t trx_rseg_array_init(); + +/** Free a rollback segment in memory. */ +void +trx_rseg_mem_free(trx_rseg_t* rseg); + +/** Create a persistent rollback segment. +@param[in] space_id system or undo tablespace id +@return pointer to new rollback segment +@retval NULL on failure */ +trx_rseg_t* +trx_rseg_create(ulint space_id) + MY_ATTRIBUTE((warn_unused_result)); + +/** Create the temporary rollback segments. */ +void +trx_temp_rseg_create(); + +/* Number of undo log slots in a rollback segment file copy */ +#define TRX_RSEG_N_SLOTS (srv_page_size / 16) + +/* Maximum number of transactions supported by a single rollback segment */ +#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2) + +/** The rollback segment memory object */ +struct trx_rseg_t { + /*--------------------------------------------------------*/ + /** rollback segment id == the index of its slot in the trx + system file copy */ + ulint id; + + /** mutex protecting the fields in this struct except id,space,page_no + which are constant */ + RsegMutex mutex; + + /** space where the rollback segment header is placed */ + fil_space_t* space; + + /** page number of the rollback segment header */ + uint32_t page_no; + + /** current size in pages */ + uint32_t curr_size; + + /*--------------------------------------------------------*/ + /* Fields for undo logs */ + /** List of undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) undo_list; + + /** List of undo log segments cached for fast reuse */ + UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached; + + /*--------------------------------------------------------*/ + + /** Last not yet purged undo log header; FIL_NULL if all purged */ + uint32_t last_page_no; + + /** trx_t::no | last_offset << 48 */ + uint64_t last_commit_and_offset; + + /** Whether the log segment needs purge */ + bool needs_purge; + + /** Reference counter to track rseg allocated transactions. 
*/ + ulint trx_ref_count; + + /** If true, then skip allocating this rseg as it resides in + an undo tablespace marked for truncate. */ + bool skip_allocation; + + /** @return the commit ID of the last committed transaction */ + trx_id_t last_trx_no() const + { return last_commit_and_offset & ((1ULL << 48) - 1); } + /** @return header offset of the last committed transaction */ + uint16_t last_offset() const + { return static_cast<uint16_t>(last_commit_and_offset >> 48); } + + void set_last_commit(uint16_t last_offset, trx_id_t trx_no) + { /* pack last_offset into bits 48..63 and trx_no into the bits below, the layout that last_offset() and last_trx_no() decode; trx_no is not masked here */ + last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no; + } + + /** @return whether the rollback segment is persistent */ + bool is_persistent() const + { /* debug checks: the segment must be in the temporary tablespace, the system tablespace, or a tablespace whose id falls in the undo id range */ + ut_ad(space == fil_system.temp_space + || space == fil_system.sys_space + || (srv_undo_space_id_start > 0 + && space->id >= srv_undo_space_id_start + && space->id <= srv_undo_space_id_start + + TRX_SYS_MAX_UNDO_SPACES)); + ut_ad(space == fil_system.temp_space + || space == fil_system.sys_space + || (srv_undo_space_id_start > 0 + && space->id >= srv_undo_space_id_start + && space->id <= srv_undo_space_id_start + + srv_undo_tablespaces_open) + || !srv_was_started); + return(space->id != SRV_TMP_SPACE_ID); /* persistent unless the tablespace id is SRV_TMP_SPACE_ID */ + } +}; + +/* Undo log segment slot in a rollback segment header */ +/*-------------------------------------------------------------*/ +#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of + an undo log segment */ +/*-------------------------------------------------------------*/ +/* Slot size */ +#define TRX_RSEG_SLOT_SIZE 4 + +/* The offset of the rollback segment header on its page */ +#define TRX_RSEG FSEG_PAGE_DATA + +/* Transaction rollback segment header */ +/*-------------------------------------------------------------*/ +/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */ +#define TRX_RSEG_FORMAT 0 +/** Number of pages in the TRX_RSEG_HISTORY list */ +#define TRX_RSEG_HISTORY_SIZE 4 +/** Committed transaction logs 
that have not been purged yet */ +#define TRX_RSEG_HISTORY 8 +#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE) + /* Header for the file segment where + this page is placed */ +#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE) + /* Undo log segment slots */ +/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0) */ +#define TRX_RSEG_MAX_TRX_ID (TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS \ + * TRX_RSEG_SLOT_SIZE) + +/** 8 bytes offset within the binlog file. This and the following offsets +are parenthesized so that they expand correctly inside any expression. */ +#define TRX_RSEG_BINLOG_OFFSET (TRX_RSEG_MAX_TRX_ID + 8) +/** MySQL log file name, 512 bytes, including terminating NUL +(valid only if TRX_RSEG_FORMAT is 0). +If no binlog information is present, the first byte is NUL. */ +#define TRX_RSEG_BINLOG_NAME (TRX_RSEG_MAX_TRX_ID + 16) +/** Maximum length of binlog file name, including terminating NUL, in bytes */ +#define TRX_RSEG_BINLOG_NAME_LEN 512 + +#ifdef WITH_WSREP +/** The offset to WSREP XID headers (directly after the 512-byte binlog +file name) */ +#define TRX_RSEG_WSREP_XID_INFO (TRX_RSEG_MAX_TRX_ID + 16 + 512) + +/** WSREP XID format (1 if present and valid, 0 if not present) */ +#define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO +/** WSREP XID GTRID length */ +#define TRX_RSEG_WSREP_XID_GTRID_LEN (TRX_RSEG_WSREP_XID_INFO + 4) +/** WSREP XID bqual length */ +#define TRX_RSEG_WSREP_XID_BQUAL_LEN (TRX_RSEG_WSREP_XID_INFO + 8) +/** WSREP XID data (XIDDATASIZE bytes) */ +#define TRX_RSEG_WSREP_XID_DATA (TRX_RSEG_WSREP_XID_INFO + 12) +#endif /* WITH_WSREP */ + +/*-------------------------------------------------------------*/ + +/** Read the page number of an undo log slot. +@param[in] rseg_header rollback segment header +@param[in] n slot number, less than TRX_RSEG_N_SLOTS +@return the 4-byte page number stored in slot n */ +inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n) +{ + ut_ad(n < TRX_RSEG_N_SLOTS); + return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE + rseg_header->frame); +} + +#ifdef WITH_WSREP +/** Update the WSREP XID information in rollback segment header. 
+@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + buf_block_t* rseg_header, + const XID* xid, + mtr_t* mtr); + +/** Update WSREP checkpoint XID in first rollback segment header +as part of wsrep_set_SE_checkpoint() when it is guaranteed that there +are no wsrep transactions committing. +If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already +stored into rollback segments, the WSREP XID in all the remaining rollback +segments will be reset. +@param[in] xid WSREP XID */ +void trx_rseg_update_wsrep_checkpoint(const XID* xid); + +/** Recover the latest WSREP checkpoint XID. +@param[out] xid WSREP XID +@return whether the WSREP XID was found */ +bool trx_rseg_read_wsrep_checkpoint(XID& xid); +#endif /* WITH_WSREP */ + +/** Upgrade a rollback segment header page to MariaDB 10.3 format. +@param[in,out] rseg_header rollback segment header page +@param[in,out] mtr mini-transaction */ +void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr); + +/** Update the offset information about the end of the binlog entry +which corresponds to the transaction just being committed. +In a replication slave, this updates the master binlog position +up to which replication has proceeded. +@param[in,out] rseg_header rollback segment header +@param[in] trx committing transaction +@param[in,out] mtr mini-transaction */ +void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx, + mtr_t *mtr); + +#include "trx0rseg.ic" + +#endif diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic new file mode 100644 index 00000000..b293d9f1 --- /dev/null +++ b/storage/innobase/include/trx0rseg.ic @@ -0,0 +1,72 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rseg.ic +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "srv0srv.h" +#include "mtr0log.h" + +/** Gets a rollback segment header. +@param[in] space space where placed +@param[in] page_no page number of the header +@param[in,out] mtr mini-transaction +@return rollback segment header, page x-latched +@retval NULL if the page cannot be read */ +UNIV_INLINE +buf_block_t* +trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr) +{ + ut_ad(space == fil_system.sys_space || space == fil_system.temp_space + || srv_is_undo_tablespace(space->id) + || !srv_was_started); + + buf_block_t* block = buf_page_get(page_id_t(space->id, page_no), + 0, RW_X_LATCH, mtr); + + /* Register the latch level only for a block that was actually + obtained, in the same way as trx_sysf_get() does, so that debug + and non-debug builds both hand a NULL result back to the caller. */ + ut_d(if (block) buf_block_dbg_add_level(block, SYNC_RSEG_HEADER);) + return block; +} + +/** Gets a newly created rollback segment header. 
+@param[in] space tablespace id where placed +@param[in] page_no page number of the header +@param[in,out] mtr mini-transaction +@return rollback segment header, page x-latched */ +UNIV_INLINE +buf_block_t* +trx_rsegf_get_new( + ulint space, + uint32_t page_no, + mtr_t* mtr) +{ + buf_block_t* block; + + ut_ad(space <= srv_undo_tablespaces_active || space == SRV_TMP_SPACE_ID + || !srv_was_started); + ut_ad(space <= TRX_SYS_MAX_UNDO_SPACES || space == SRV_TMP_SPACE_ID); + + block = buf_page_get(page_id_t(space, page_no), 0, RW_X_LATCH, mtr); + + /* Same NULL-tolerant form of the debug latch-level hook as in + trx_sysf_get(): never touch a block that was not obtained. */ + ut_d(if (block) buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);) + return block; +} diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h new file mode 100644 index 00000000..424e4447 --- /dev/null +++ b/storage/innobase/include/trx0sys.h @@ -0,0 +1,1235 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0sys.h +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0sys_h +#define trx0sys_h + +#include "buf0buf.h" +#include "fil0fil.h" +#include "trx0types.h" +#include "mem0mem.h" +#include "mtr0mtr.h" +#include "ut0byte.h" +#include "ut0lst.h" +#include "read0types.h" +#include "page0types.h" +#include "ut0mutex.h" +#include "trx0trx.h" +#ifdef WITH_WSREP +#include "trx0xa.h" +#endif /* WITH_WSREP */ +#include "ilist.h" + +/** Checks if a page address is the trx sys header page. +@param[in] page_id page id +@return true if page_id is (TRX_SYS_SPACE, TRX_SYS_PAGE_NO) */ +inline bool trx_sys_hdr_page(const page_id_t page_id) +{ + return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO); +} + +/*****************************************************************//** +Creates and initializes the transaction system at the database creation. */ +void +trx_sys_create_sys_pages(void); +/*==========================*/ +/** Find an available rollback segment. +@param[in] sys_header the TRX_SYS page +@return an unallocated rollback segment slot in the TRX_SYS header +@retval ULINT_UNDEFINED if not found */ +ulint +trx_sys_rseg_find_free(const buf_block_t* sys_header); +/** Request the TRX_SYS page. +@param[in,out] mtr mini-transaction +@param[in] rw whether to lock the page for writing (RW_X_LATCH); +otherwise RW_S_LATCH is requested +@return the TRX_SYS page +@retval NULL if the page cannot be read */ +inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true) +{ + buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), + 0, rw ? 
RW_X_LATCH : RW_S_LATCH, mtr); + ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);) /* debug-only hook, skipped when no block was returned */ + return block; +} + +#ifdef UNIV_DEBUG +/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ +extern uint trx_rseg_n_slots_debug; +#endif + +/** Write DB_TRX_ID. +@param[out] db_trx_id the DB_TRX_ID field to be written to (6 bytes) +@param[in] id transaction ID */ +UNIV_INLINE +void +trx_write_trx_id(byte* db_trx_id, trx_id_t id) +{ + compile_time_assert(DATA_TRX_ID_LEN == 6); + mach_write_to_6(db_trx_id, id); +} + +/** Read a transaction identifier. +@param[in] ptr pointer to a 6-byte stored transaction identifier +@return id */ +inline +trx_id_t +trx_read_trx_id(const byte* ptr) +{ + compile_time_assert(DATA_TRX_ID_LEN == 6); + return(mach_read_from_6(ptr)); +} + +#ifdef UNIV_DEBUG +/** Check that the DB_TRX_ID in a record is valid, that is, either 0 or +greater than trx_id. +@param[in] db_trx_id the DB_TRX_ID column to validate +@param[in] trx_id the id of the ALTER TABLE transaction +@return true always; a violation is reported by the debug assertion */ +inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id) +{ + trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id)); + ut_ad(id == 0 || id > trx_id); + return true; +} +#endif + +/*****************************************************************//** +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave, this updates the latest master binlog position up to which +replication has proceeded. */ +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + const char* file_name,/*!< in: MySQL log file name */ + int64_t offset, /*!< in: position in that log file */ + buf_block_t* sys_header, /*!< in,out: trx sys header */ + mtr_t* mtr); /*!< in,out: mini-transaction */ +/** Display the MySQL binlog offset info if it is present in the trx +system header. */ +void +trx_sys_print_mysql_binlog_offset(); + +/** Create the rollback segments. 
+@return whether the creation succeeded */ +bool +trx_sys_create_rsegs(); + +/** The automatically created system rollback segment has this id */ +#define TRX_SYS_SYSTEM_RSEG_ID 0 + +/** The offset of the transaction system header on the page */ +#define TRX_SYS FSEG_PAGE_DATA + +/** Transaction system header */ +/*------------------------------------------------------------- @{ */ +/** In old versions of InnoDB, this persisted the value of +trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5, +the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages +and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages +are used instead. The field only exists for the purpose of upgrading +from older MySQL or MariaDB versions. */ +#define TRX_SYS_TRX_ID_STORE 0 +#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the + tablespace segment the trx + system is created into */ +#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE) + /*!< the start of the array of + rollback segment specification + slots */ +/*------------------------------------------------------------- @} */ + +/** The number of rollback segments; rollback segment id must fit in +the 7 bits reserved for it in DB_ROLL_PTR. */ +#define TRX_SYS_N_RSEGS 128 +/** Maximum number of undo tablespaces (not counting the system tablespace) */ +#define TRX_SYS_MAX_UNDO_SPACES (TRX_SYS_N_RSEGS - 1) + +/* Rollback segment specification slot offsets */ + +/** the tablespace ID of an undo log header; starting with +MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */ +#define TRX_SYS_RSEG_SPACE 0 +/** the page number of an undo log header, or FIL_NULL if unused */ +#define TRX_SYS_RSEG_PAGE_NO 4 +/** Size of a rollback segment specification slot */ +#define TRX_SYS_RSEG_SLOT_SIZE 8 + +/** Read the tablespace ID of a rollback segment slot. 
+@param[in] sys_header TRX_SYS page +@param[in] rseg_id rollback segment identifier +@return undo tablespace id */ +inline +uint32_t +trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id) +{ + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); +} + +/** Read the page number of a rollback segment slot. +@param[in] sys_header TRX_SYS page +@param[in] rseg_id rollback segment identifier +@return undo page number */ +inline uint32_t +trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id) +{ + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); +} + +/** Maximum length of MySQL binlog file name, in bytes. +(Used before MariaDB 10.3.5.) */ +#define TRX_SYS_MYSQL_LOG_NAME_LEN 512 +/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */ +#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 + +#if UNIV_PAGE_SIZE_MIN < 4096 +# error "UNIV_PAGE_SIZE_MIN < 4096" +#endif +/** The offset of the MySQL binlog offset info in the trx system header */ +#define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000) +#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is + TRX_SYS_MYSQL_LOG_MAGIC_N + if we have valid data in the + MySQL binlog info */ +#define TRX_SYS_MYSQL_LOG_OFFSET 4 /*!< the 64-bit offset + within that file */ +#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ + +/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096 + +0...37 FIL_HEADER +38...45 TRX_SYS_TRX_ID_STORE +46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10) +56 TRX_SYS_RSEGS + 56...59 TRX_SYS_RSEG_SPACE for slot 0 + 60...63 TRX_SYS_RSEG_PAGE_NO for slot 0 + 64...67 TRX_SYS_RSEG_SPACE for slot 1 + 68...71 TRX_SYS_RSEG_PAGE_NO for slot 1 +.... + 592..595 TRX_SYS_RSEG_SPACE for slot 67 + 596..599 TRX_SYS_RSEG_PAGE_NO for slot 67 +... 
+ 1076..1079 TRX_SYS_RSEG_PAGE_NO for slot 127 + +(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace +space_id, page_no pairs :::) +596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD +600 TRX_SYS_WSREP_XID_FORMAT +604 TRX_SYS_WSREP_XID_GTRID_LEN +608 TRX_SYS_WSREP_XID_BQUAL_LEN +612 TRX_SYS_WSREP_XID_DATA (len = 128) +739 TRX_SYS_WSREP_XID_DATA_END + +FIXED WSREP XID info offsets for 4k page size 10.0.32-galera +(srv_page_size-2500) +1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD +1600 TRX_SYS_WSREP_XID_FORMAT +1604 TRX_SYS_WSREP_XID_GTRID_LEN +1608 TRX_SYS_WSREP_XID_BQUAL_LEN +1612 TRX_SYS_WSREP_XID_DATA (len = 128) +1739 TRX_SYS_WSREP_XID_DATA_END + +(srv_page_size - 2000 MYSQL MASTER LOG) +2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD +2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH +2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW +2108 TRX_SYS_MYSQL_LOG_NAME + +(srv_page_size - 1000 MYSQL LOG) +3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD +3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH +3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW +3108 TRX_SYS_MYSQL_LOG_NAME + +(srv_page_size - 200 DOUBLEWRITE) +3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG +3906 TRX_SYS_DOUBLEWRITE_MAGIC +3910 TRX_SYS_DOUBLEWRITE_BLOCK1 +3914 TRX_SYS_DOUBLEWRITE_BLOCK2 +3918 TRX_SYS_DOUBLEWRITE_REPEAT +3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + +(srv_page_size - 8, TAILER) +4088..4095 FIL_TAILER + +*/ +#ifdef WITH_WSREP +/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */ +#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL) +#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0 +#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265 + +/** XID field: formatID, gtrid_len, bqual_len, xid_data */ +#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE) +#define TRX_SYS_WSREP_XID_FORMAT 4 +#define TRX_SYS_WSREP_XID_GTRID_LEN 8 +#define TRX_SYS_WSREP_XID_BQUAL_LEN 12 +#define TRX_SYS_WSREP_XID_DATA 16 +#endif /* WITH_WSREP*/ + +/** Doublewrite buffer */ +/* @{ */ +/** 
The offset of the doublewrite buffer header on the trx system header page */ +#define TRX_SYS_DOUBLEWRITE (srv_page_size - 200) +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg + containing the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE + /*!< 4-byte magic number which + shows if we already have + created the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE) + /*!< page number of the + first page in the first + sequence of 64 + (= FSP_EXTENT_SIZE) consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE) + /*!< page number of the + first page in the second + sequence of 64 consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_BLOCK1, + TRX_SYS_DOUBLEWRITE_BLOCK2 + so that if the trx sys + header is half-written + to disk, we still may + be able to recover the + information */ +/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, +we must reset the doublewrite buffer, because starting from 4.1.x the +space id of a data page is stored into +FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. 
*/ +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE) + +/*-------------------------------------------------------------*/ +/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */ +constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855; +/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */ +constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386; +/* @} */ + +trx_t* current_trx(); + +struct rw_trx_hash_element_t +{ + rw_trx_hash_element_t(): trx(0) + { + mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex); + } + + + ~rw_trx_hash_element_t() + { + mutex_free(&mutex); + } + + + trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */ + + /** + Transaction serialization number. + + Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY + state. Initially set to TRX_ID_MAX. + */ + Atomic_counter<trx_id_t> no; + trx_t *trx; + ib_mutex_t mutex; +}; + + +/** + Wrapper around LF_HASH to store set of in memory read-write transactions. +*/ + +class rw_trx_hash_t +{ + LF_HASH hash; + + + template <typename T> + using walk_action= my_bool(rw_trx_hash_element_t *element, T *action); + + + /** + Constructor callback for lock-free allocator. + + Object is just allocated and is not yet accessible via rw_trx_hash by + concurrent threads. Object can be reused multiple times before it is freed. + Every time object is being reused initializer() callback is called. + */ + + static void rw_trx_hash_constructor(uchar *arg) + { + new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t(); + } + + + /** + Destructor callback for lock-free allocator. + + Object is about to be freed and is not accessible via rw_trx_hash by + concurrent threads. + */ + + static void rw_trx_hash_destructor(uchar *arg) + { + reinterpret_cast<rw_trx_hash_element_t*> + (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t(); + } + + + /** + Destructor callback for lock-free allocator. + + This destructor is used at shutdown. It frees remaining transaction + objects. 
+ + XA PREPARED transactions may remain if they haven't been committed or + rolled back. ACTIVE transactions may remain if startup was interrupted or + server is running in read-only mode or for certain srv_force_recovery + levels. + */ + + static void rw_trx_hash_shutdown_destructor(uchar *arg) + { + rw_trx_hash_element_t *element= + reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD); + if (trx_t *trx= element->trx) + { + ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) || + trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) || + (trx_state_eq(trx, TRX_STATE_ACTIVE) && + (!srv_was_started || + srv_read_only_mode || + srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO))); + trx_free_at_shutdown(trx); + } + element->~rw_trx_hash_element_t(); + } + + + /** + Initializer callback for lock-free hash. + + Object is not yet accessible via rw_trx_hash by concurrent threads, but is + about to become such. Object id can be changed only by this callback and + remains the same until all pins to this object are released. + + Object trx can be changed to 0 by erase() under object mutex protection, + which indicates it is about to be removed from lock-free hash and become + not accessible by concurrent threads. + */ + + static void rw_trx_hash_initializer(LF_HASH *, + rw_trx_hash_element_t *element, + trx_t *trx) + { + ut_ad(element->trx == 0); + element->trx= trx; + element->id= trx->id; + element->no= TRX_ID_MAX; + trx->rw_trx_hash_element= element; + } + + + /** + Gets LF_HASH pins. + + Pins are used to protect object from being destroyed or reused. They are + normally stored in trx object for quick access. If caller doesn't have trx + available, we try to get it using current_trx(). If caller doesn't have trx + at all, temporary pins are allocated by find() and iterate() instead of + calling this method, which dereferences trx unconditionally. 
+ */ + + LF_PINS *get_pins(trx_t *trx) + { + if (!trx->rw_trx_hash_pins) + { + trx->rw_trx_hash_pins= lf_hash_get_pins(&hash); + ut_a(trx->rw_trx_hash_pins); + } + return trx->rw_trx_hash_pins; + } + + + template <typename T> struct eliminate_duplicates_arg + { + trx_ids_t ids; + walk_action<T> *action; + T *argument; + eliminate_duplicates_arg(size_t size, walk_action<T> *act, T *arg): + action(act), argument(arg) { ids.reserve(size); } + }; + + + template <typename T> + static my_bool eliminate_duplicates(rw_trx_hash_element_t *element, + eliminate_duplicates_arg<T> *arg) + { + for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++) + { + if (*it == element->id) + return 0; + } + arg->ids.push_back(element->id); + return arg->action(element, arg->argument); + } + + +#ifdef UNIV_DEBUG + static void validate_element(trx_t *trx) + { + ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg); + ut_ad(!trx->is_autocommit_non_locking()); + /* trx->state can be anything except TRX_STATE_NOT_STARTED */ + mutex_enter(&trx->mutex); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) || + trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + mutex_exit(&trx->mutex); + } + + + template <typename T> struct debug_iterator_arg + { + walk_action<T> *action; + T *argument; + }; + + + template <typename T> + static my_bool debug_iterator(rw_trx_hash_element_t *element, + debug_iterator_arg<T> *arg) + { + mutex_enter(&element->mutex); + if (element->trx) + validate_element(element->trx); + mutex_exit(&element->mutex); + return arg->action(element, arg->argument); + } +#endif + + +public: + void init() + { + lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0, + sizeof(trx_id_t), 0, &my_charset_bin); + hash.alloc.constructor= rw_trx_hash_constructor; + hash.alloc.destructor= rw_trx_hash_destructor; + hash.initializer= 
reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer); + } + + + void destroy() + { + hash.alloc.destructor= rw_trx_hash_shutdown_destructor; + lf_hash_destroy(&hash); + } + + + /** + Releases LF_HASH pins. + + Must be called by thread that owns trx_t object when the latter is being + "detached" from thread (e.g. released to the pool by trx_t::free()). Can be + called earlier if thread is expected not to use rw_trx_hash. + + Since pins are not allowed to be transferred to another thread, + initialisation thread calls this for recovered transactions. + */ + + void put_pins(trx_t *trx) + { + if (trx->rw_trx_hash_pins) + { + lf_hash_put_pins(trx->rw_trx_hash_pins); + trx->rw_trx_hash_pins= 0; + } + } + + + /** + Finds trx object in lock-free hash with given id. + + Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless + the transaction may get committed before this method returns. + + With do_ref_count == false the caller may dereference returned trx pointer + only if lock_sys.mutex was acquired before calling find(). + + With do_ref_count == true caller may dereference trx even if it is not + holding lock_sys.mutex. Caller is responsible for calling + trx->release_reference() when it is done playing with trx. + + Ideally this method should get caller rw_trx_hash_pins along with trx + object as a parameter, similar to insert() and erase(). However most + callers lose trx early in their call chains and it is not that easy to pass + them through. + + So we take more expensive approach: get trx through current_thd()->ha_data. + Some threads don't have trx attached to THD, and at least server + initialisation thread, fts_optimize_thread, srv_master_thread, + dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even + have THD at all. For such cases we allocate pins only for duration of + search and free them immediately. 
+ + This has negative performance impact and should be fixed eventually (by + passing caller_trx as a parameter). Still stream of DML is more or less Ok. + + @return + @retval 0 not found + @retval pointer to trx + */ + + trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count) + { + /* + In MariaDB 10.3, purge will reset DB_TRX_ID to 0 + when the history is lost. Read/write transactions will + always have a nonzero trx_t::id; there the value 0 is + reserved for transactions that did not write or lock + anything yet. + + The caller should already have handled trx_id==0 specially. + */ + ut_ad(trx_id); + ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count); + + trx_t *trx= 0; + LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash); + ut_a(pins); + + rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*> + (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id), + sizeof(trx_id_t))); + if (element) + { + mutex_enter(&element->mutex); + lf_hash_search_unpin(pins); + if ((trx= element->trx)) { + DBUG_ASSERT(trx_id == trx->id); + ut_d(validate_element(trx)); + if (do_ref_count) + { + /* + We have an early state check here to avoid committer + starvation in a wait loop for transaction references, + when there's a stream of trx_sys.find() calls from other + threads. The trx->state may change to COMMITTED after + trx->mutex is released, and it will have to be rechecked + by the caller after reacquiring the mutex. + */ + trx_mutex_enter(trx); + const trx_state_t state= trx->state; + trx_mutex_exit(trx); + if (state == TRX_STATE_COMMITTED_IN_MEMORY) + trx= NULL; + else + trx->reference(); + } + } + mutex_exit(&element->mutex); + } + if (!caller_trx) + lf_hash_put_pins(pins); + return trx; + } + + + /** + Inserts trx to lock-free hash. + + Object becomes accessible via rw_trx_hash. 
+ */ + + void insert(trx_t *trx) + { + ut_d(validate_element(trx)); + int res= lf_hash_insert(&hash, get_pins(trx), + reinterpret_cast<void*>(trx)); + ut_a(res == 0); + } + + + /** + Removes trx from lock-free hash. + + Object becomes not accessible via rw_trx_hash. But it still can be pinned + by concurrent find(), which is supposed to release it immediately after + it sees object trx is 0. + */ + + void erase(trx_t *trx) + { + ut_d(validate_element(trx)); + mutex_enter(&trx->rw_trx_hash_element->mutex); + trx->rw_trx_hash_element->trx= 0; + mutex_exit(&trx->rw_trx_hash_element->mutex); + int res= lf_hash_delete(&hash, get_pins(trx), + reinterpret_cast<const void*>(&trx->id), + sizeof(trx_id_t)); + ut_a(res == 0); + } + + + /** + Returns the number of elements in the hash. + + The number is exact only if hash is protected against concurrent + modifications (e.g. single threaded startup or hash is protected + by some mutex). Otherwise the number may be used as a hint only, + because it may change even before this method returns. + */ + + uint32_t size() { return uint32_t(lf_hash_size(&hash)); } + + + /** + Iterates the hash. + + @param caller_trx used to get/set pins + @param action called for every element in hash + @param argument opaque argument passed to action + + May return the same element multiple times if hash is under contention. + If caller doesn't like to see the same transaction multiple times, it has + to call iterate_no_dups() instead. + + May return element with committed transaction. If caller doesn't like to + see committed transactions, it has to skip those under element mutex: + + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + // trx is protected against commit in this branch + } + mutex_exit(&element->mutex); + + May miss concurrently inserted transactions. 
+ + @return + @retval 0 iteration completed successfully + @retval 1 iteration was interrupted (action returned 1) + */ + + template <typename T> + int iterate(trx_t *caller_trx, walk_action<T> *action, T *argument= nullptr) + { + LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash); + ut_a(pins); +#ifdef UNIV_DEBUG + debug_iterator_arg<T> debug_arg= { action, argument }; + action= reinterpret_cast<decltype(action)>(debug_iterator<T>); + argument= reinterpret_cast<T*>(&debug_arg); +#endif + int res= lf_hash_iterate(&hash, pins, + reinterpret_cast<my_hash_walk_action>(action), + const_cast<void*>(static_cast<const void*> + (argument))); + if (!caller_trx) + lf_hash_put_pins(pins); + return res; + } + + + template <typename T> + int iterate(walk_action<T> *action, T *argument= nullptr) + { + return iterate(current_trx(), action, argument); + } + + + /** + Iterates the hash and eliminates duplicate elements. + + @sa iterate() + */ + + template <typename T> + int iterate_no_dups(trx_t *caller_trx, walk_action<T> *action, + T *argument= nullptr) + { + eliminate_duplicates_arg<T> arg(size() + 32, action, argument); + return iterate(caller_trx, eliminate_duplicates<T>, &arg); + } + + + template <typename T> + int iterate_no_dups(walk_action<T> *action, T *argument= nullptr) + { + return iterate_no_dups(current_trx(), action, argument); + } +}; + +class thread_safe_trx_ilist_t +{ +public: + void create() { mutex_create(LATCH_ID_TRX_SYS, &mutex); } + void close() { mutex_free(&mutex); } + + bool empty() const + { + mutex_enter(&mutex); + auto result= trx_list.empty(); + mutex_exit(&mutex); + return result; + } + + void push_front(trx_t &trx) + { + mutex_enter(&mutex); + trx_list.push_front(trx); + mutex_exit(&mutex); + } + + void remove(trx_t &trx) + { + mutex_enter(&mutex); + trx_list.remove(trx); + mutex_exit(&mutex); + } + + template <typename Callable> void for_each(Callable &&callback) const + { + mutex_enter(&mutex); + for (const auto &trx : 
trx_list) + callback(trx); + mutex_exit(&mutex); + } + + template <typename Callable> void for_each(Callable &&callback) + { + mutex_enter(&mutex); + for (auto &trx : trx_list) + callback(trx); + mutex_exit(&mutex); + } + + void freeze() const { mutex_enter(&mutex); } + void unfreeze() const { mutex_exit(&mutex); } + +private: + alignas(CACHE_LINE_SIZE) mutable TrxSysMutex mutex; + alignas(CACHE_LINE_SIZE) ilist<trx_t> trx_list; +}; + +/** The transaction system central memory data structure. */ +class trx_sys_t +{ + /** + The smallest number not yet assigned as a transaction id or transaction + number. Accessed and updated with atomic operations. + */ + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id; + + + /** + Solves race conditions between register_rw() and snapshot_ids() as well as + race condition between assign_new_trx_no() and snapshot_ids(). + + @sa register_rw() + @sa assign_new_trx_no() + @sa snapshot_ids() + */ + MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version; + + + bool m_initialised; + +public: + /** + TRX_RSEG_HISTORY list length (number of committed transactions to purge) + */ + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len; + + /** List of all transactions. */ + thread_safe_trx_ilist_t trx_list; + + MY_ALIGNED(CACHE_LINE_SIZE) + /** Temporary rollback segments */ + trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS]; + + MY_ALIGNED(CACHE_LINE_SIZE) + trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS]; + /*!< Pointer array to rollback + segments; NULL if slot not in use; + created and destroyed in + single-threaded mode; not protected + by any mutex, because it is read-only + during multi-threaded operation */ + + /** + Lock-free hash of in memory read-write transactions. + Works faster when it is on its own cache line (tested). 
+ */ + + MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash; + + +#ifdef WITH_WSREP + /** Latest recovered XID during startup */ + XID recovered_wsrep_xid; +#endif + /** Latest recovered binlog offset */ + uint64_t recovered_binlog_offset; + /** Latest recovered binlog file name */ + char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN]; + /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */ + lsn_t recovered_binlog_lsn; + + + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + + trx_sys_t(): m_initialised(false) {} + + + /** + Returns the minimum trx id in rw trx list. + + This is the smallest id for which the trx can possibly be active. (But, you + must look at the trx->state to find out if the minimum trx id transaction + itself is active, or already committed.) + + @return the minimum trx id, or m_max_trx_id if the trx list is empty + */ + + trx_id_t get_min_trx_id() + { + trx_id_t id= get_max_trx_id(); + rw_trx_hash.iterate(get_min_trx_id_callback, &id); + return id; + } + + + /** + Determines the maximum transaction id. + + @return current value of m_max_trx_id, that is, the smallest number not + yet assigned as a transaction id or transaction number; will be stale + after the next call to trx_sys.get_new_trx_id() + */ + + trx_id_t get_max_trx_id() + { + return m_max_trx_id; + } + + + /** + Allocates a new transaction id. + @return new, allocated trx id + */ + + trx_id_t get_new_trx_id() + { + trx_id_t id= get_new_trx_id_no_refresh(); + refresh_rw_trx_hash_version(); + return id; + } + + + /** + Allocates and assigns new transaction serialisation number. + + There's a gap between m_max_trx_id increment and transaction serialisation + number becoming visible through rw_trx_hash. While we're in this gap + concurrent thread may come and do MVCC snapshot without seeing allocated + but not yet assigned serialisation number. Then at some point purge thread + may clone this view. 
As a result it won't see newly allocated serialisation + number and may remove "unnecessary" history data of this transaction from + rollback segments. + + m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has + to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively + means that all transaction serialisation numbers up to m_max_trx_id are + available through rw_trx_hash. + + We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so + that m_rw_trx_hash_version increment happens after + trx->rw_trx_hash_element->no becomes visible through rw_trx_hash. + + @param trx transaction + */ + void assign_new_trx_no(trx_t *trx) + { + trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh(); + refresh_rw_trx_hash_version(); + } + + + /** + Takes MVCC snapshot. + + To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements + in ids. + + For details about get_rw_trx_hash_version() != get_max_trx_id() spin + @sa register_rw() and @sa assign_new_trx_no(). + + We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so + that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash. + + To optimise snapshot creation rw_trx_hash.iterate() is being used instead + of rw_trx_hash.iterate_no_dups(). It means that some transaction + identifiers may appear multiple times in ids. 
+ + @param[in,out] caller_trx used to get access to rw_trx_hash_pins + @param[out] ids array to store registered transaction identifiers + @param[out] max_trx_id variable to store m_max_trx_id value + @param[out] min_trx_no variable to store min(no) value + */ + + void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id, + trx_id_t *min_trx_no) + { + snapshot_ids_arg arg(ids); + + while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id()) + ut_delay(1); + arg.m_no= arg.m_id; + + ids->clear(); + ids->reserve(rw_trx_hash.size() + 32); + rw_trx_hash.iterate(caller_trx, copy_one_id, &arg); + + *max_trx_id= arg.m_id; + *min_trx_no= arg.m_no; + } + + + /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */ + void init_max_trx_id(trx_id_t value) + { + m_max_trx_id= value; + m_rw_trx_hash_version.store(value, std::memory_order_relaxed); + } + + + bool is_initialised() { return m_initialised; } + + + /** Initialise the transaction subsystem. */ + void create(); + + /** Close the transaction subsystem on shutdown. */ + void close(); + + /** @return total number of active (non-prepared) transactions */ + ulint any_active_transactions(); + + + /** + Registers read-write transaction. + + Transaction becomes visible to MVCC. + + There's a gap between m_max_trx_id increment and transaction becoming + visible through rw_trx_hash. While we're in this gap concurrent thread may + come and do MVCC snapshot. As a result concurrent read view will be able to + observe records owned by this transaction even before it was committed. + + m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has + to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively + means that all transactions up to m_max_trx_id are available through + rw_trx_hash. + + We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so + that m_rw_trx_hash_version increment happens after transaction becomes + visible through rw_trx_hash. 
+ */ + + void register_rw(trx_t *trx) + { + trx->id= get_new_trx_id_no_refresh(); + rw_trx_hash.insert(trx); + refresh_rw_trx_hash_version(); + } + + + /** + Deregisters read-write transaction. + + Transaction is removed from rw_trx_hash, which releases all implicit locks. + MVCC snapshot won't see this transaction anymore. + */ + + void deregister_rw(trx_t *trx) + { + rw_trx_hash.erase(trx); + } + + + bool is_registered(trx_t *caller_trx, trx_id_t id) + { + return id && find(caller_trx, id, false); + } + + + trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true) + { + return rw_trx_hash.find(caller_trx, id, do_ref_count); + } + + + /** + Registers transaction in trx_sys. + + @param trx transaction + */ + void register_trx(trx_t *trx) + { + trx_list.push_front(*trx); + } + + + /** + Deregisters transaction in trx_sys. + + @param trx transaction + */ + void deregister_trx(trx_t *trx) + { + trx_list.remove(*trx); + } + + + /** + Clones the oldest view and stores it in view. + + No need to call ReadView::close(). The caller owns the view that is passed + in. This function is called by purge thread to determine whether it should + purge the delete marked record or not. + */ + void clone_oldest_view(ReadViewBase *view) const; + + + /** @return the number of active views */ + size_t view_count() const + { + size_t count= 0; + + trx_list.for_each([&count](const trx_t &trx) { + if (trx.read_view.is_open()) + ++count; + }); + + return count; + } + +private: + static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element, + trx_id_t *id) + { + if (element->id < *id) + { + mutex_enter(&element->mutex); + /* We don't care about read-only transactions here. 
*/ + if (element->trx && element->trx->rsegs.m_redo.rseg) + *id= element->id; + mutex_exit(&element->mutex); + } + return 0; + } + + + struct snapshot_ids_arg + { + snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {} + trx_ids_t *m_ids; + trx_id_t m_id; + trx_id_t m_no; + }; + + + static my_bool copy_one_id(rw_trx_hash_element_t *element, + snapshot_ids_arg *arg) + { + if (element->id < arg->m_id) + { + trx_id_t no= element->no; + arg->m_ids->push_back(element->id); + if (no < arg->m_no) + arg->m_no= no; + } + return 0; + } + + + /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */ + trx_id_t get_rw_trx_hash_version() + { + return m_rw_trx_hash_version.load(std::memory_order_acquire); + } + + + /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */ + void refresh_rw_trx_hash_version() + { + m_rw_trx_hash_version.fetch_add(1, std::memory_order_release); + } + + + /** + Allocates new transaction id without refreshing rw_trx_hash version. + + This method is extracted for exclusive use by register_rw() and + assign_new_trx_no() where new id must be allocated atomically with + payload of these methods from MVCC snapshot point of view. + + @sa get_new_trx_id() + @sa assign_new_trx_no() + + @return new transaction id + */ + + trx_id_t get_new_trx_id_no_refresh() + { + return m_max_trx_id++; + } +}; + + +/** The transaction system */ +extern trx_sys_t trx_sys; + +#endif diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h new file mode 100644 index 00000000..09132e7c --- /dev/null +++ b/storage/innobase/include/trx0trx.h @@ -0,0 +1,1126 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.h +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0trx_h +#define trx0trx_h + +#include "trx0types.h" +#include "lock0types.h" +#include "que0types.h" +#include "mem0mem.h" +#include "trx0xa.h" +#include "ut0vec.h" +#include "fts0fts.h" +#include "read0types.h" +#include "ilist.h" + +#include <vector> +#include <set> + +// Forward declaration +struct mtr_t; +struct rw_trx_hash_element_t; + +/******************************************************************//** +Set detailed error message for the transaction. */ +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /*!< in: transaction struct */ + const char* msg); /*!< in: detailed error message */ +/*************************************************************//** +Set detailed error message for the transaction from a file. Note that the +file is rewound before reading from it. 
*/ +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /*!< in: transaction struct */ + FILE* file); /*!< in: file to read message from */ +/****************************************************************//** +Retrieves the error_info field from a trx. +@return the error info */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + const trx_t* trx); /*!< in: trx object */ + +/** @return an allocated transaction */ +trx_t *trx_create(); + +/** At shutdown, frees a transaction object. */ +void trx_free_at_shutdown(trx_t *trx); + +/** Disconnect a prepared transaction from MySQL. +@param[in,out] trx transaction */ +void trx_disconnect_prepared(trx_t *trx); + +/** Initialize (resurrect) transactions at startup. */ +dberr_t trx_lists_init_at_db_start(); + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +void +trx_start_if_not_started_xa_low( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + bool read_write); /*!< in: true if read write transaction */ +/*************************************************************//** +Starts the transaction if it is not yet started. */ +void +trx_start_if_not_started_low( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction */ + bool read_write); /*!< in: true if read write transaction */ + +/*************************************************************//** +Starts a transaction for internal processing. */ +void +trx_start_internal_low( +/*===================*/ + trx_t* trx); /*!< in/out: transaction */ + +/** Starts a read-only transaction for internal processing. 
+@param[in,out] trx transaction to be started */ +void +trx_start_internal_read_only_low( + trx_t* trx); + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started_xa(t, rw) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_xa_low((t), rw); \ + } while (false) + +#define trx_start_if_not_started(t, rw) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_low((t), rw); \ + } while (false) + +#define trx_start_internal(t) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_internal_low((t)); \ + } while (false) + +#define trx_start_internal_read_only(t) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_internal_read_only_low(t); \ + } while (false) +#else +#define trx_start_if_not_started(t, rw) \ + trx_start_if_not_started_low((t), rw) + +#define trx_start_internal(t) \ + trx_start_internal_low((t)) + +#define trx_start_internal_read_only(t) \ + trx_start_internal_read_only_low(t) + +#define trx_start_if_not_started_xa(t, rw) \ + trx_start_if_not_started_xa_low((t), (rw)) +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Starts the transaction for a DDL operation. */ +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op); /*!< in: dictionary operation type */ + +#ifdef UNIV_DEBUG +#define trx_start_for_ddl(t, o) \ + do { \ + ut_ad((t)->start_file == 0); \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_for_ddl_low((t), (o)); \ + } while (0) +#else +#define trx_start_for_ddl(t, o) \ + trx_start_for_ddl_low((t), (o)) +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Does the transaction commit for MySQL. 
+@return DB_SUCCESS or error number */ +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx); /*!< in/out: transaction */ +/** XA PREPARE a transaction. +@param[in,out] trx transaction to prepare */ +void trx_prepare_for_mysql(trx_t* trx); +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions */ +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + uint len); /*!< in: number of slots in xid_list */ +/** Look up an X/Open distributed transaction in XA PREPARE state. +@param[in] xid X/Open XA transaction identifier +@return transaction on match (the trx_t::xid will be invalidated); +note that the trx may have been committed before the caller acquires +trx_t::mutex +@retval NULL if no match */ +trx_t* trx_get_trx_by_xid(const XID* xid); +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +void +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx); /*!< in/out: transaction */ +/**********************************************************************//** +Marks the latest SQL statement ended. */ +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx); /*!< in: trx handle */ +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a commit command node struct. 
+@return own: commit node struct */ +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap); /*!< in: memory heap in which the node is created */ +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/**********************************************************************//** +Prints info about a transaction. +The lock statistics (n_rec_locks, n_trx_locks, heap_size) are supplied +by the caller. */ +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: lock_number_of_rows_locked(&trx->lock) */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size); + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ + +/**********************************************************************//** +Prints info about a transaction. +When possible, use trx_print() instead. */ +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len); /*!< in: max query length to print, + or 0 to use the default max length */ + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys.mutex. */ +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len); /*!< in: max query length to print, + or 0 to use the default max length */ + +/**********************************************************************//** +Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */ +UNIV_INLINE +enum trx_dict_op_t +trx_get_dict_operation( +/*===================*/ + const trx_t* trx) /*!< in: transaction */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************************//** +Flag a transaction as a dictionary operation. */ +UNIV_INLINE +void +trx_set_dict_operation( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + enum trx_dict_op_t op); /*!< in: operation, not + TRX_DICT_OP_NONE */ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx->mutex, or it must be the thread +that is serving a running transaction. +A running RW transaction must be in trx_sys.rw_trx_hash. +@return true if trx->state == state */ +UNIV_INLINE +bool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state, /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + bool relaxed = false) + /*!< in: whether to allow + trx->state == TRX_STATE_NOT_STARTED + after an error has been reported */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. +@return true if interrupted */ +bool +trx_is_interrupted( +/*===============*/ + const trx_t* trx); /*!< in: transaction */ + +/*******************************************************************//** +Calculates the "weight" of a transaction. The weight of one transaction +is estimated as the number of altered rows + the number of locked rows. +@param t transaction +@return transaction weight */ +#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks)) + +/*******************************************************************//** +Compares the "weight" (or size) of two transactions.
Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. +@return true if weight(a) >= weight(b) */ +bool +trx_weight_ge( +/*==========*/ + const trx_t* a, /*!< in: the transaction to be compared */ + const trx_t* b); /*!< in: the transaction to be compared */ +/* Maximum length of a string that can be returned by +trx_get_que_state_str(). */ +#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */ + +/*******************************************************************//** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. +@return string in the data segment */ +UNIV_INLINE +const char* +trx_get_que_state_str( +/*==================*/ + const trx_t* trx); /*!< in: transaction */ + +/** Retrieves the transaction ID. +At a given point in time it is guaranteed that IDs of the running +transactions are unique. The values returned by this function for readonly +transactions may be reused, so a subsequent RO transaction may get the same ID +as a RO transaction that existed in the past. The values returned by this +function should be used for printing purposes only. +@param[in] trx transaction whose id to retrieve +@return transaction id */ +UNIV_INLINE +trx_id_t +trx_get_id_for_print( + const trx_t* trx); + +/** Create the trx_t pool */ +void +trx_pool_init(); + +/** Destroy the trx_t pool */ +void +trx_pool_close(); + +/** +Set the transaction as a read-write transaction if it is not already +tagged as such. +@param[in,out] trx Transaction that needs to be "upgraded" to RW from RO */ +void +trx_set_rw_mode( + trx_t* trx); + +/** +Transactions that aren't started by the MySQL server don't set +the trx_t::mysql_thd field. For such transactions we set the lock +wait timeout to 0 instead of the user configured value that comes +from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param t transaction +@return lock wait timeout in seconds */ +#define trx_lock_wait_timeout_get(t) \ + ((t)->mysql_thd != NULL \ + ? thd_lock_wait_timeout((t)->mysql_thd) \ + : 0) + +typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> > lock_list; + +/*******************************************************************//** +Latching protocol for trx_lock_t::que_state. trx_lock_t::que_state +captures the state of the query thread during the execution of a query. +This is different from a transaction state. The query state of a transaction +can be updated asynchronously by other threads. The other threads can be +system threads, like the timeout monitor thread or user threads executing +other queries. Another thing to be mindful of is that there is a delay between +when a query thread is put into LOCK_WAIT state and before it actually starts +waiting. Between these two events it is possible that the query thread is +granted the lock it was waiting for, which implies that the state can be changed +asynchronously. + +All these operations take place within the context of locking. Therefore state +changes within the locking code must acquire both the lock mutex and the +trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or +trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient +to only acquire the trx->mutex. +To query the state either of the mutexes is sufficient within the locking +code and no mutex is required when the query thread is no longer waiting. */ + +/** The locks and state of an active transaction. Protected by +lock_sys.mutex, trx->mutex or both. */ +struct trx_lock_t { +#ifdef UNIV_DEBUG + /** number of active query threads; at most 1, except for the + dummy transaction in trx_purge() */ + ulint n_active_thrs; +#endif + trx_que_t que_state; /*!< valid when trx->state + == TRX_STATE_ACTIVE: TRX_QUE_RUNNING, + TRX_QUE_LOCK_WAIT, ...
*/ + + lock_t* wait_lock; /*!< if trx execution state is + TRX_QUE_LOCK_WAIT, this points to + the lock request, otherwise this is + NULL; set to non-NULL when holding + both trx->mutex and lock_sys.mutex; + set to NULL when holding + lock_sys.mutex; readers should + hold lock_sys.mutex, except when + they are holding trx->mutex and + wait_lock==NULL */ + ib_uint64_t deadlock_mark; /*!< A mark field that is initialized + to and checked against lock_mark_counter + by lock_deadlock_recursive(). */ + bool was_chosen_as_deadlock_victim; + /*!< when the transaction decides to + wait for a lock, it sets this to false; + if another transaction chooses this + transaction as a victim in deadlock + resolution, it sets this to true. + Protected by trx->mutex. */ + time_t wait_started; /*!< lock wait started at this time, + protected only by lock_sys.mutex */ + + que_thr_t* wait_thr; /*!< query thread belonging to this + trx that is in QUE_THR_LOCK_WAIT + state. For threads suspended in a + lock wait, this is protected by + lock_sys.mutex. Otherwise, this may + only be modified by the thread that is + serving the running transaction.
*/ +#ifdef WITH_WSREP + bool was_chosen_as_wsrep_victim; + /*!< high priority wsrep thread has + marked this trx to abort */ +#endif /* WITH_WSREP */ + + /** Pre-allocated record locks */ + struct { + ib_lock_t lock; byte pad[256]; + } rec_pool[8]; + + /** Pre-allocated table locks */ + ib_lock_t table_pool[8]; + + /** Next available rec_pool[] entry */ + unsigned rec_cached; + + /** Next available table_pool[] entry */ + unsigned table_cached; + + mem_heap_t* lock_heap; /*!< memory heap for trx_locks; + protected by lock_sys.mutex */ + + trx_lock_list_t trx_locks; /*!< locks requested by the transaction; + insertions are protected by trx->mutex + and lock_sys.mutex; removals are + protected by lock_sys.mutex */ + + lock_list table_locks; /*!< All table locks requested by this + transaction, including AUTOINC locks */ + + /** List of pending trx_t::evict_table() */ + UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables; + + bool cancel; /*!< true if the transaction is being + rolled back either via deadlock + detection or due to lock timeout. The + caller has to acquire the trx_t::mutex + in order to cancel the locks. In + lock_trx_table_locks_remove() we + check for this cancel of a transaction's + locks and avoid reacquiring the trx + mutex to prevent recursive deadlocks. + Protected by both the lock sys mutex + and the trx_t::mutex. */ + ulint n_rec_locks; /*!< number of rec locks in this trx */ +}; + +/** Logical first modification time of a table in a transaction */ +class trx_mod_table_time_t +{ + /** First modification of the table */ + undo_no_t first; + /** First modification of a system versioned column */ + undo_no_t first_versioned; + + /** Magic value signifying that a system versioned column of a + table was never modified in a transaction.
*/ + static const undo_no_t UNVERSIONED = IB_ID_MAX; + +public: + /** Constructor + @param[in] rows number of modified rows so far */ + trx_mod_table_time_t(undo_no_t rows) + : first(rows), first_versioned(UNVERSIONED) {} + +#ifdef UNIV_DEBUG + /** Validation + @param[in] rows number of modified rows so far + (defaults to UNVERSIONED, which makes the first <= rows + check pass for any first) + @return whether the object is valid */ + bool valid(undo_no_t rows = UNVERSIONED) const + { + return first <= first_versioned && first <= rows; + } +#endif /* UNIV_DEBUG */ + /** @return if versioned columns were modified */ + bool is_versioned() const { return first_versioned != UNVERSIONED; } + + /** After writing an undo log record, set is_versioned() if needed + @param[in] rows number of modified rows so far */ + void set_versioned(undo_no_t rows) + { + ut_ad(!is_versioned()); + first_versioned = rows; + ut_ad(valid()); + } + + /** Invoked after partial rollback + @param[in] limit number of surviving modified rows + @return whether this should be erased from trx_t::mod_tables + (true when first >= limit) + NOTE(review): first_versioned is reset to UNVERSIONED only when it is + below limit; confirm that this (rather than first_versioned >= limit) + is the intended condition. */ + bool rollback(undo_no_t limit) + { + ut_ad(valid()); + if (first >= limit) { + return true; + } + + if (first_versioned < limit && is_versioned()) { + first_versioned = UNVERSIONED; + } + + return false; + } +}; + +/** Collection of persistent tables and their first modification +in a transaction. +We store pointers to the table objects in memory because +we know that a table object will not be destroyed while a transaction +that modified it is running. */ +typedef std::map< + dict_table_t*, trx_mod_table_time_t, + std::less<dict_table_t*>, + ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > > + trx_mod_tables_t; + +/** The transaction handle + +Normally, there is a 1:1 relationship between a transaction handle +(trx) and a session (client connection). One session is associated +with exactly one user transaction. There are some exceptions to this: + +* For DDL operations, a subtransaction is allocated that modifies the +data dictionary tables.
Lock waits and deadlocks are prevented by +acquiring the dict_sys.latch before starting the subtransaction +and releasing it after committing the subtransaction. + +* The purge system uses a special transaction that is not associated +with any session. + +* If the system crashed or it was quickly shut down while there were +transactions in the ACTIVE or PREPARED state, these transactions would +no longer be associated with a session when the server is restarted. + +A session may be served by at most one thread at a time. The serving +thread of a session might change in some MySQL implementations. +Therefore we do not have os_thread_get_curr_id() assertions in the code. + +Normally, only the thread that is currently associated with a running +transaction may access (read and modify) the trx object, and it may do +so without holding any mutex. The following are exceptions to this: + +* trx_rollback_recovered() may access resurrected (connectionless) +transactions (state == TRX_STATE_ACTIVE && is_recovered) +while the system is already processing new user transactions (!is_recovered). + +* trx_print_low() may access transactions not associated with the current +thread. The caller must be holding lock_sys.mutex. + +* When a transaction handle is in the trx_sys.trx_list, some of its fields +must not be modified without holding trx->mutex. + +* The locking code (in particular, lock_deadlock_recursive() and +lock_rec_convert_impl_to_expl()) will access transactions associated +with other connections. The locks of transactions are protected by +lock_sys.mutex (insertions also by trx->mutex). */ + +/** Represents an instance of a rollback segment along with its state +variables. */ +struct trx_undo_ptr_t { + trx_rseg_t* rseg; /*!< rollback segment assigned to the + transaction, or NULL if not assigned + yet */ + trx_undo_t* undo; /*!< pointer to the undo log, or + NULL if nothing logged yet */ +}; + +/** An instance of temporary rollback segment.
*/ +struct trx_temp_undo_t { + /** temporary rollback segment, or NULL if not assigned yet */ + trx_rseg_t* rseg; + /** pointer to the undo log, or NULL if nothing logged yet */ + trx_undo_t* undo; +}; + +/** Rollback segments assigned to a transaction for undo logging. */ +struct trx_rsegs_t { + /** undo log ptr holding reference to a rollback segment that resides in + system/undo tablespace used for undo logging of tables that need + to be recovered on crash. */ + trx_undo_ptr_t m_redo; + + /** undo log for temporary tables; discarded immediately after + transaction commit/rollback */ + trx_temp_undo_t m_noredo; +}; + +struct trx_t : ilist_node<> { +private: + /** + Count of references. + + We can't release the locks nor commit the transaction until this reference + is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify + that it is no longer "active". + */ + + Atomic_counter<int32_t> n_ref; + + +public: + TrxMutex mutex; /*!< Mutex protecting the fields + state and lock (except some fields + of lock, which are protected by + lock_sys.mutex) */ + + trx_id_t id; /*!< transaction id */ + + /** State of the trx from the point of view of concurrency control + and the valid state transitions.
+ + Possible states: + + TRX_STATE_NOT_STARTED + TRX_STATE_ACTIVE + TRX_STATE_PREPARED + TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED) + TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED) + + Valid state transitions are: + + Regular transactions: + * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED + + Auto-commit non-locking read-only: + * NOT_STARTED -> ACTIVE -> NOT_STARTED + + XA (2PC): + * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED + + Recovered XA: + * NOT_STARTED -> PREPARED -> COMMITTED -> (freed) + + Recovered XA followed by XA ROLLBACK: + * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed) + + XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT): + * NOT_STARTED -> PREPARED -> (freed) + + Disconnected XA can become recovered: + * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected) + Disconnected means from mysql e.g due to the mysql client disconnection. + Latching and various transaction lists membership rules: + + XA (2PC) transactions are always treated as non-autocommit. + + Transitions to ACTIVE or NOT_STARTED occur when transaction + is not in rw_trx_hash. + + Autocommit non-locking read-only transactions move between states + without holding any mutex. They are not in rw_trx_hash. + + All transactions, unless they are determined to be ac-nl-ro, + explicitly tagged as read-only or read-write, will first be put + on the read-only transaction list. Only when a !read-only transaction + in the read-only list tries to acquire an X or IX lock on a table + do we remove it from the read-only list and put it on the read-write + list. During this switch we assign it a rollback segment. + + When a transaction is NOT_STARTED, it can be in trx_list. It cannot be + in rw_trx_hash. + + ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash. + The transition ACTIVE->PREPARED is protected by trx->mutex. + + ACTIVE->COMMITTED is possible when the transaction is in + rw_trx_hash.
+ + Transitions to COMMITTED are protected by trx_t::mutex. */ + trx_state_t state; +#ifdef WITH_WSREP + /** whether wsrep_on(mysql_thd) held at the start of transaction */ + bool wsrep; + bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); } + /** true, if BF thread is performing unique secondary index scanning */ + bool wsrep_UK_scan; + bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep_UK_scan); } +#else /* WITH_WSREP */ + bool is_wsrep() const { return false; } +#endif /* WITH_WSREP */ + + ReadView read_view; /*!< consistent read view used in the + transaction; this is an object rather + than a pointer (assert_freed() expects + !read_view.is_open()) */ + trx_lock_t lock; /*!< Information about the transaction + locks and state. Protected by + lock_sys.mutex (insertions also + by trx_t::mutex). */ + + /* These fields are not protected by any mutex. */ + + /** false=normal transaction, true=recovered (must be rolled back) + or disconnected transaction in XA PREPARE STATE. + + This field is accessed by the thread that owns the transaction, + without holding any mutex. + There is only one foreign-thread access in trx_print_low() + and a possible race condition with trx_disconnect_prepared(). */ + bool is_recovered; + const char* op_info; /*!< English text describing the + current operation, or an empty + string */ + uint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */ + bool check_foreigns; /*!< normally TRUE, but if the user + wants to suppress foreign key checks, + (in table imports, for example) we + set this FALSE */ + /*------------------------------*/ + /* MySQL has a transaction coordinator to coordinate two phase + commit between multiple storage engines and the binary log. When + an engine participates in a transaction, it's responsible for + registering itself using the trans_register_ha() API. */ + bool is_registered; /* This flag is set to true after the + transaction has been registered with + the coordinator using the XA API, and + is set to false after commit or + rollback.
*/ + /** whether this is holding the prepare mutex */ + bool active_commit_ordered; + /*------------------------------*/ + bool check_unique_secondary; + /*!< normally TRUE, but if the user + wants to speed up inserts by + suppressing unique key checks + for secondary indexes when we decide + if we can use the insert buffer for + them, we set this FALSE */ + bool flush_log_later;/* In 2PC, we hold the + prepare_commit mutex across + both phases. In that case, we + defer flush of the logs to disk + until after we release the + mutex. */ + bool must_flush_log_later;/*!< set in commit() + if flush_log_later was + set and redo log was written; + in that case we will + flush the log in + trx_commit_complete_for_mysql() */ + ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ + trx_dict_op_t dict_operation; /**< @see enum trx_dict_op_t */ + + ib_uint32_t dict_operation_lock_mode; + /*!< 0, RW_S_LATCH, or RW_X_LATCH: + the latch mode trx currently holds + on dict_sys.latch. Protected + by dict_sys.latch. */ + + /** wall-clock time of the latest transition to TRX_STATE_ACTIVE; + used for diagnostic purposes only */ + time_t start_time; + /** microsecond_interval_timer() of transaction start */ + ulonglong start_time_micro; + lsn_t commit_lsn; /*!< lsn at the time of the commit */ + table_id_t table_id; /*!< Table to drop iff dict_operation + == TRX_DICT_OP_TABLE, or 0.
*/ + /*------------------------------*/ + THD* mysql_thd; /*!< MySQL thread handle corresponding + to this trx, or NULL */ + + const char* mysql_log_file_name; + /*!< if MySQL binlog is used, this field + contains a pointer to the latest file + name; this is NULL if binlog is not + used */ + ulonglong mysql_log_offset; + /*!< if MySQL binlog is used, this + field contains the end offset of the + binlog entry */ + /*------------------------------*/ + ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables + used in the processing of the current + SQL statement in MySQL */ + ib_uint32_t mysql_n_tables_locked; + /*!< how many tables the current SQL + statement uses, except those + in consistent read */ + dberr_t error_state; /*!< 0 if no error, otherwise error + number; NOTE That ONLY the thread + doing the transaction is allowed to + set this field: this is NOT protected + by any mutex */ + const dict_index_t*error_info; /*!< if the error number indicates a + duplicate key error, a pointer to + the problematic index is stored here */ + ulint error_key_num; /*!< if the index creation fails due to + a duplicate key error, a mysql key + number of that index is stored here */ + que_t* graph; /*!< query currently run in the session, + or NULL if none; NOTE that the query + belongs to the session, and it can + survive over a transaction commit, if + it is a stored procedure with a COMMIT + WORK statement, for instance */ + /*------------------------------*/ + UT_LIST_BASE_NODE_T(trx_named_savept_t) + trx_savepoints; /*!< savepoints set with SAVEPOINT ..., + oldest first */ + /*------------------------------*/ + undo_no_t undo_no; /*!< next undo log record number to + assign; since the undo log is + private for a transaction, this + is a simple ascending sequence + with no gaps; thus it represents + the number of modified/inserted + rows in a transaction */ + trx_savept_t last_sql_stat_start; + /*!< undo_no when the last sql statement + was started: in case of an
error, trx + is rolled back down to this number */ + trx_rsegs_t rsegs; /* rollback segments for undo logging */ + undo_no_t roll_limit; /*!< least undo number to undo during + a partial rollback; 0 otherwise */ + bool in_rollback; /*!< true when the transaction is + executing a partial or full rollback */ + ulint pages_undone; /*!< number of undo log pages undone + since the last undo log truncation */ + /*------------------------------*/ + ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for + an SQL statement. This is useful for + multi-row INSERTs */ + ib_vector_t* autoinc_locks; /* AUTOINC locks held by this + transaction. Note that these are + also in the lock list trx_locks. This + vector needs to be freed explicitly + when the trx instance is destroyed. + Protected by lock_sys.mutex. */ + /*------------------------------*/ + bool read_only; /*!< true if transaction is flagged + as a READ-ONLY transaction. + if auto_commit && !will_lock + then it will be handled as a + AC-NL-RO-SELECT (Auto Commit Non-Locking + Read Only Select). A read only + transaction will not be assigned an + UNDO log. */ + bool auto_commit; /*!< true if it is an autocommit */ + bool will_lock; /*!< set to inform trx_start_low() that + the transaction may acquire locks */ + /*------------------------------*/ + fts_trx_t* fts_trx; /*!< FTS information, or NULL if + transaction hasn't modified tables + with FTS indexes (yet). */ + doc_id_t fts_next_doc_id;/* The document id used for updates */ + /*------------------------------*/ + ib_uint32_t flush_tables; /*!< if "covering" the FLUSH TABLES", + count of tables being flushed. */ + + /*------------------------------*/ + bool ddl; /*!< true if it is an internal + transaction for DDL */ + bool internal; /*!< true if it is a system/internal + transaction background task. This + includes DDL transactions too. Such + transactions are always treated as + read-write.
*/ + /*------------------------------*/ +#ifdef UNIV_DEBUG + unsigned start_line; /*!< Track where it was started from */ + const char* start_file; /*!< Filename where it was started */ +#endif /* UNIV_DEBUG */ + + XID* xid; /*!< X/Open XA transaction + identification to identify a + transaction branch */ + trx_mod_tables_t mod_tables; /*!< List of tables that were modified + by this transaction */ + /*------------------------------*/ + char* detailed_error; /*!< detailed error message for last + error, or empty. */ + rw_trx_hash_element_t *rw_trx_hash_element; + LF_PINS *rw_trx_hash_pins; + ulint magic_n; + + /** @return whether any persistent undo log has been generated */ + bool has_logged_persistent() const + { + return(rsegs.m_redo.undo); + } + + /** @return whether any undo log has been generated */ + bool has_logged() const + { + return(has_logged_persistent() || rsegs.m_noredo.undo); + } + + /** @return rollback segment for modifying temporary tables */ + trx_rseg_t* get_temp_rseg() + { + if (trx_rseg_t* rseg = rsegs.m_noredo.rseg) { + ut_ad(id != 0); + return(rseg); + } + + return(assign_temp_rseg()); + } + + /** Transition to committed state, to release implicit locks. */ + inline void commit_state(); + + /** Release any explicit locks of a committing transaction. */ + inline void release_locks(); + + /** Evict a table definition due to the rollback of ALTER TABLE. + @param[in] table_id table identifier */ + void evict_table(table_id_t table_id); + + /** Initiate rollback. + @param savept savepoint to which to roll back + @return error code or DB_SUCCESS */ + dberr_t rollback(trx_savept_t *savept= nullptr); + /** Roll back an active transaction. + @param savept savepoint to which to roll back */ + inline void rollback_low(trx_savept_t *savept= nullptr); + /** Finish rollback.
+ @return whether the rollback was completed normally + @retval false if the rollback was aborted by shutdown */ + inline bool rollback_finish(); +private: + /** Mark a transaction committed in the main memory data structures. */ + inline void commit_in_memory(const mtr_t *mtr); + /** Commit the transaction in a mini-transaction. + @param mtr mini-transaction (if there are any persistent modifications) */ + void commit_low(mtr_t *mtr= nullptr); +public: + /** Commit the transaction. */ + void commit(); + + /** @return whether the reference count n_ref is positive */ + bool is_referenced() const { return n_ref > 0; } + + /** Increment the reference count n_ref */ + void reference() + { +#ifdef UNIV_DEBUG + auto old_n_ref= +#endif + n_ref++; + ut_ad(old_n_ref >= 0); + } + + /** Decrement the reference count n_ref */ + void release_reference() + { +#ifdef UNIV_DEBUG + auto old_n_ref= +#endif + n_ref--; + ut_ad(old_n_ref > 0); + } + + /** @return whether the table has lock on + mysql.innodb_table_stats and mysql.innodb_index_stats */ + bool has_stats_table_lock() const; + + /** Free the memory to trx_pools */ + void free(); + + /** Debug assertions on the state expected of a freed transaction */ + void assert_freed() const + { + ut_ad(state == TRX_STATE_NOT_STARTED); + ut_ad(!id); + ut_ad(!has_logged()); + ut_ad(!is_referenced()); + ut_ad(!is_wsrep()); +#ifdef WITH_WSREP + ut_ad(!lock.was_chosen_as_wsrep_victim); +#endif + ut_ad(!read_view.is_open()); + ut_ad(!lock.wait_thr); + ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(lock.table_locks.empty()); + ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks)); + ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0); + ut_ad(dict_operation == TRX_DICT_OP_NONE); + } + + /** @return whether this is a non-locking autocommit transaction */ + bool is_autocommit_non_locking() const { return auto_commit && !will_lock; } + +private: + /** Assign a rollback segment for modifying temporary tables. + @return the assigned rollback segment */ + trx_rseg_t *assign_temp_rseg(); +}; + +/** +Check if transaction is started.
+@param[in] trx Transaction whose state we need to check +@return true if the transaction state is not TRX_STATE_NOT_STARTED */ +inline bool trx_is_started(const trx_t* trx) +{ + return trx->state != TRX_STATE_NOT_STARTED; +} + +/* Transaction isolation levels (trx->isolation_level) */ +#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking + SELECTs are performed so that + we do not look at a possible + earlier version of a record; + thus they are not 'consistent' + reads under this isolation + level; otherwise like level + 2 */ + +#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like + isolation, except that in + range UPDATE and DELETE we + must block phantom rows + with next-key locks; + SELECT ... FOR UPDATE and ... + LOCK IN SHARE MODE only lock + the index records, NOT the + gaps before them, and thus + allow free inserting; + each consistent read reads its + own snapshot */ + +#define TRX_ISO_REPEATABLE_READ 2 /* this is the default; + all consistent reads in the + same trx read the same + snapshot; + full next-key locking used + in locking reads to block + insertions into gaps */ + +#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are + converted to LOCK IN SHARE + MODE reads */ + +/* Treatment of duplicate values (trx->duplicates; for example, in inserts). +Multiple flags can be combined with bitwise OR. */ +#define TRX_DUP_IGNORE 1U /* duplicate rows are to be updated */ +#define TRX_DUP_REPLACE 2U /* duplicate rows are to be replaced */ + + +/** Commit node states */ +enum commit_node_state { + COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to + the transaction */ + COMMIT_NODE_WAIT /*!< commit signal sent to the transaction, + waiting for completion */ +}; + +/** Commit command node in a query graph */ +struct commit_node_t{ + que_common_t common; /*!< node type: QUE_NODE_COMMIT */ + enum commit_node_state + state; /*!< node execution state */ +}; + + +/** Test if trx->mutex is owned.
*/ +#define trx_mutex_own(t) mutex_own(&t->mutex) + +/** Acquire the trx->mutex. */ +#define trx_mutex_enter(t) do { \ + mutex_enter(&t->mutex); \ +} while (0) + +/** Release the trx->mutex. */ +#define trx_mutex_exit(t) do { \ + mutex_exit(&t->mutex); \ +} while (0) + +#include "trx0trx.ic" + +#endif diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic new file mode 100644 index 00000000..93c9591e --- /dev/null +++ b/storage/innobase/include/trx0trx.ic @@ -0,0 +1,206 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.ic +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx->mutex, or it must be the thread +that is serving a running transaction. +A running RW transaction must be in trx_sys.rw_trx_hash. 
+@return TRUE if trx->state == state */ +UNIV_INLINE +bool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state, /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + bool relaxed) + /*!< in: whether to allow + trx->state == TRX_STATE_NOT_STARTED + after an error has been reported */ +{ +#ifdef UNIV_DEBUG + switch (trx->state) { + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + ut_ad(!trx->is_autocommit_non_locking()); + return(trx->state == state); + + case TRX_STATE_ACTIVE: + if (trx->is_autocommit_non_locking()) { + ut_ad(!trx->is_recovered); + ut_ad(trx->read_only); + ut_ad(trx->mysql_thd); + } + return(state == trx->state); + + case TRX_STATE_NOT_STARTED: + /* These states are not allowed for running transactions. */ + ut_a(state == TRX_STATE_NOT_STARTED + || (relaxed + && thd_get_error_number(trx->mysql_thd))); + + return(true); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(trx->state == state); +} + +/****************************************************************//** +Retrieves the error_info field from a trx. +@return the error info */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + const trx_t* trx) /*!< in: trx object */ +{ + return(trx->error_info); +} + +/*******************************************************************//** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. 
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	/* Anything that is not one of the known states is reported as
+	"UNKNOWN". */
+	const char*	label = "UNKNOWN";
+
+	/* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
+	switch (trx->lock.que_state) {
+	case TRX_QUE_RUNNING:
+		label = "RUNNING";
+		break;
+	case TRX_QUE_LOCK_WAIT:
+		label = "LOCK WAIT";
+		break;
+	case TRX_QUE_ROLLING_BACK:
+		label = "ROLLING BACK";
+		break;
+	case TRX_QUE_COMMITTING:
+		label = "COMMITTING";
+		break;
+	default:
+		break;
+	}
+
+	return label;
+}
+
+/** Retrieves the transaction ID for printing.
+At any given point in time the IDs of the running transactions are
+guaranteed to be unique. The values returned for read-only transactions
+may be reused, so a later RO transaction may get the same ID as an RO
+transaction that existed in the past. The values returned by this
+function should be used for printing purposes only.
+@param[in]	trx	transaction whose id to retrieve
+@return transaction id */
+UNIV_INLINE
+trx_id_t
+trx_get_id_for_print(
+	const trx_t*	trx)
+{
+	/* Read-only transactions, and transactions whose intentions are
+	not yet known (whether they will eventually do a WRITE), have no
+	trx_t::id assigned; it is 0 for them. The transaction IDs in
+	  innodb_trx.trx_id,
+	  innodb_locks.lock_id,
+	  innodb_locks.lock_trx_id,
+	  innodb_lock_waits.requesting_trx_id,
+	  innodb_lock_waits.blocking_trx_id
+	should match, because those tables could be used in an SQL JOIN on
+	those columns. trx_t::id is also printed by SHOW ENGINE INNODB
+	STATUS and in logs, so the same value must be printed everywhere
+	consistently. */
+
+	/* DATA_TRX_ID_LEN is the storage size in bytes, so this is the
+	smallest value that no longer fits in a stored transaction ID. */
+	static const trx_id_t	id_limit
+		= 1ULL << (DATA_TRX_ID_LEN * CHAR_BIT);
+
+	ut_ad(trx->id < id_limit);
+
+	if (trx->id != 0) {
+		return trx->id;
+	}
+
+	/* No ID assigned: derive a printable value from the address of the
+	transaction object, and set the bit just above the range of real
+	transaction IDs. */
+	return reinterpret_cast<trx_id_t>(trx) | id_limit;
+}
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op_t
+trx_get_dict_operation(
+/*===================*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation);
+
+#ifdef UNIV_DEBUG
+	switch (op) {	/* debug builds: accept only the known modes */
+	case TRX_DICT_OP_NONE:
+	case TRX_DICT_OP_TABLE:
+	case TRX_DICT_OP_INDEX:
+		return(op);
+	}
+	ut_error;	/* reached only if op is none of the values above */
+#endif /* UNIV_DEBUG */
+	return(op);
+}
+/**********************************************************************//**
+Flag a transaction as a dictionary operation.
+Sets trx->ddl and stores op in trx->dict_operation. In debug builds the
+requested mode is first checked against the mode currently stored in
+the transaction. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+	trx_t*			trx,	/*!< in/out: transaction */
+	enum trx_dict_op_t	op)	/*!< in: operation, not
+					TRX_DICT_OP_NONE */
+{
+#ifdef UNIV_DEBUG
+	enum trx_dict_op_t	old_op = trx_get_dict_operation(trx);
+
+	switch (op) {
+	case TRX_DICT_OP_NONE:	/* never a valid new mode; see the
+				description of the op parameter */
+		ut_error;
+		break;
+	case TRX_DICT_OP_TABLE:	/* allowed on top of any known
+				previous mode */
+		switch (old_op) {
+		case TRX_DICT_OP_NONE:
+		case TRX_DICT_OP_INDEX:
+		case TRX_DICT_OP_TABLE:
+			goto ok;
+		}
+		ut_error;	/* old_op is not a known mode */
+		break;
+	case TRX_DICT_OP_INDEX:	/* only expected when no mode has been
+				set before */
+		ut_ad(old_op == TRX_DICT_OP_NONE);
+		break;
+	}
+ok:
+#endif /* UNIV_DEBUG */
+
+	trx->ddl = true;
+	trx->dict_operation = op;
+}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000..99a9c66c
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0types.h +Transaction system global type definitions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0types_h +#define trx0types_h + +#include "ut0byte.h" +#include "ut0mutex.h" + +#include <vector> + +/** printf(3) format used for printing DB_TRX_ID and other system fields */ +#define TRX_ID_FMT IB_ID_FMT + +/** maximum length that a formatted trx_t::id could take, not including +the terminating NUL character. */ +static const ulint TRX_ID_MAX_LEN = 17; + +/** Space id of the transaction system page (the system tablespace) */ +static const ulint TRX_SYS_SPACE = 0; + +/** Page number of the transaction system page */ +#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO + +/** Random value to check for corruption of trx_t */ +static const ulint TRX_MAGIC_N = 91118598; + +constexpr uint innodb_purge_threads_MAX= 32; + +/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */ +enum trx_que_t { + TRX_QUE_RUNNING, /*!< transaction is running */ + TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for + a lock */ + TRX_QUE_ROLLING_BACK, /*!< transaction is rolling back */ + TRX_QUE_COMMITTING /*!< transaction is committing */ +}; + +/** Transaction states (trx_t::state) */ +enum trx_state_t { + TRX_STATE_NOT_STARTED, + + TRX_STATE_ACTIVE, + /** XA PREPARE has been executed; only XA COMMIT or XA ROLLBACK + are possible */ + TRX_STATE_PREPARED, + /** XA PREPARE transaction that was returned to ha_recover() */ + TRX_STATE_PREPARED_RECOVERED, + TRX_STATE_COMMITTED_IN_MEMORY +}; + +/** Type of data 
dictionary operation */ +enum trx_dict_op_t { + /** The transaction is not modifying the data dictionary. */ + TRX_DICT_OP_NONE = 0, + /** The transaction is creating a table or an index, or + dropping a table. The table must be dropped in crash + recovery. This and TRX_DICT_OP_NONE are the only possible + operation modes in crash recovery. */ + TRX_DICT_OP_TABLE = 1, + /** The transaction is creating or dropping an index in an + existing table. In crash recovery, the data dictionary + must be locked, but the table must not be dropped. */ + TRX_DICT_OP_INDEX = 2 +}; + +/** Memory objects */ +/* @{ */ +/** Transaction */ +struct trx_t; +/** The locks and state of an active transaction */ +struct trx_lock_t; +/** Rollback segment */ +struct trx_rseg_t; +/** Transaction undo log */ +struct trx_undo_t; +/** Rollback command node in a query graph */ +struct roll_node_t; +/** Commit command node in a query graph */ +struct commit_node_t; +/** SAVEPOINT command node in a query graph */ +struct trx_named_savept_t; +/* @} */ + +/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */ +typedef ib_id_t row_id_t; +/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */ +typedef ib_id_t trx_id_t; +/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */ +typedef ib_id_t roll_ptr_t; +/** Undo number */ +typedef ib_id_t undo_no_t; + +/** Transaction savepoint */ +struct trx_savept_t{ + undo_no_t least_undo_no; /*!< least undo number to undo */ +}; + +/** File objects */ +/* @{ */ +/** Undo segment header */ +typedef byte trx_usegf_t; +/** Undo log header */ +typedef byte trx_ulogf_t; +/** Undo log page header */ +typedef byte trx_upagef_t; + +/** Undo log record */ +typedef byte trx_undo_rec_t; + +/* @} */ + +typedef ib_mutex_t RsegMutex; +typedef ib_mutex_t TrxMutex; +typedef ib_mutex_t PQMutex; +typedef ib_mutex_t TrxSysMutex; + +typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t; +#endif /* trx0types_h */ diff --git a/storage/innobase/include/trx0undo.h 
b/storage/innobase/include/trx0undo.h new file mode 100644 index 00000000..319ea4ee --- /dev/null +++ b/storage/innobase/include/trx0undo.h @@ -0,0 +1,465 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.h +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0undo_h +#define trx0undo_h + +#ifndef UNIV_INNOCHECKSUM +#include "trx0sys.h" + +/** The LSB of the "is insert" flag in DB_ROLL_PTR */ +#define ROLL_PTR_INSERT_FLAG_POS 55 +/** The LSB of the 7-bit trx_rseg_t::id in DB_ROLL_PTR */ +#define ROLL_PTR_RSEG_ID_POS 48 +/** The LSB of the 32-bit undo log page number in DB_ROLL_PTR */ +#define ROLL_PTR_PAGE_POS 16 +/** The LSB of the 16-bit byte offset within an undo log page in DB_ROLL_PTR */ +#define ROLL_PTR_BYTE_POS 0 + +/***********************************************************************//** +Builds a roll pointer. 
+@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + bool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + uint32_t page_no, /*!< in: page number */ + uint16_t offset); /*!< in: offset of the undo entry within page */ +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + bool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + uint32_t* page_no, /*!< out: page number */ + uint16_t* offset); /*!< out: offset of the undo + entry within page */ +/***********************************************************************//** +Determine if DB_ROLL_PTR is of the insert type. +@return true if insert */ +UNIV_INLINE +bool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr); /*!< in: roll pointer */ +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ + MY_ATTRIBUTE((warn_unused_result)); +/** Write DB_ROLL_PTR. +@param[out] ptr buffer +@param[in] roll_ptr DB_ROLL_PTR value */ +inline void trx_write_roll_ptr(byte* ptr, roll_ptr_t roll_ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + mach_write_to_7(ptr, roll_ptr); +} +/** Read DB_ROLL_PTR. +@param[in] ptr buffer +@return roll ptr */ +inline roll_ptr_t trx_read_roll_ptr(const byte* ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + return mach_read_from_7(ptr); +} + +/** Gets an undo log page and x-latches it. 
+@param[in] page_id page id +@param[in,out] mtr mini-transaction +@return pointer to page x-latched */ +UNIV_INLINE +buf_block_t* +trx_undo_page_get(const page_id_t page_id, mtr_t* mtr); + +/** Gets an undo log page and s-latches it. +@param[in] page_id page id +@param[in,out] mtr mini-transaction +@return pointer to page s-latched */ +UNIV_INLINE +buf_block_t* +trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr); + +/** Get the next record in an undo log. +@param[in] undo_page undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@return undo log record, the page latched, NULL if none */ +inline trx_undo_rec_t* +trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec, + uint32_t page_no, uint16_t offset); +/** Get the previous record in an undo log. +@param[in,out] block undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +trx_undo_rec_t* +trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no, + uint16_t offset, bool shared, mtr_t *mtr); +/** Get the next record in an undo log. +@param[in,out] block undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +trx_undo_rec_t* +trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no, + uint16_t offset, mtr_t *mtr); + +/** Get the first record in an undo log. 
+@param[in] space undo log header space +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH +@param[out] block undo log page +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +trx_undo_rec_t* +trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no, + uint16_t offset, ulint mode, buf_block_t*& block, + mtr_t *mtr); + +/** Initialize an undo log page. +NOTE: This corresponds to a redo log record and must not be changed! +@see mtr_t::undo_create() +@param[in,out] block undo log page */ +void trx_undo_page_init(const buf_block_t &block); + +/** Allocate an undo log page. +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any page latch +@return X-latched block if success +@retval NULL on failure */ +buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Free the last undo log page. The caller must hold the rseg mutex. +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any undo log page + or that has allocated the undo log page */ +void +trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); + +/** Truncate the tail of an undo log during rollback. +@param[in,out] undo undo log +@param[in] limit all undo logs after this limit will be discarded +@param[in] is_temp whether this is temporary undo log */ +void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp); + +/** Truncate the head of an undo log. +NOTE that only whole pages are freed; the header page is not +freed, but emptied, if all the records there are below the limit. 
+@param[in,out] rseg rollback segment +@param[in] hdr_page_no header page number +@param[in] hdr_offset header offset on the page +@param[in] limit first undo number to preserve +(everything below the limit will be truncated) */ +void +trx_undo_truncate_start( + trx_rseg_t* rseg, + uint32_t hdr_page_no, + uint16_t hdr_offset, + undo_no_t limit); +/** Mark that an undo log header belongs to a data dictionary transaction. +@param[in] trx dictionary transaction +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction */ +void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr); +/** Assign an undo log for a persistent transaction. +A new undo log is created or a cached undo log reused. +@param[in,out] trx transaction +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); +/** Assign an undo log for a transaction. +A new undo log is created or a cached undo log reused. +@param[in,out] trx transaction +@param[in] rseg rollback segment +@param[out] undo the undo log +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, + dberr_t* err, mtr_t* mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Sets the state of the undo log segment at a transaction finish. +@return undo log segment header page, x-latched */ +buf_block_t* +trx_undo_set_state_at_finish( +/*=========================*/ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr); /*!< in: mtr */ + +/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK. 
+@param[in,out] trx transaction +@param[in,out] undo undo log +@param[in] rollback false=XA PREPARE, true=XA ROLLBACK +@param[in,out] mtr mini-transaction */ +void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback, + mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); + +/** Free temporary undo log after commit or rollback. +The information is not needed after a commit or rollback, therefore +the data can be discarded. +@param undo temporary undo log */ +void trx_undo_commit_cleanup(trx_undo_t *undo); + +/** At shutdown, frees the undo logs of a transaction. */ +void +trx_undo_free_at_shutdown(trx_t *trx); + +/** Read an undo log when starting up the database. +@param[in,out] rseg rollback segment +@param[in] id rollback segment slot +@param[in] page_no undo log segment page number +@param[in,out] max_trx_id the largest observed transaction ID +@return the undo log +@retval nullptr on error */ +trx_undo_t * +trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no, + trx_id_t &max_trx_id); + +#endif /* !UNIV_INNOCHECKSUM */ + +/** the only rollback segment type since MariaDB 10.3.1 */ +constexpr uint16_t TRX_UNDO_UPDATE= 2; +/* TRX_UNDO_STATE values of an undo log segment */ +/** contains an undo log of an active transaction */ +constexpr uint16_t TRX_UNDO_ACTIVE = 1; +/** cached for quick reuse */ +constexpr uint16_t TRX_UNDO_CACHED = 2; +/** can be freed in purge when all undo data in it is removed */ +constexpr uint16_t TRX_UNDO_TO_PURGE = 4; +/** contains an undo log of a prepared transaction */ +constexpr uint16_t TRX_UNDO_PREPARED = 5; + +#ifndef UNIV_INNOCHECKSUM + +/** Transaction undo log memory object; modified by the thread associated +with the transaction. 
*/ + +struct trx_undo_t { + /*-----------------------------*/ + ulint id; /*!< undo log slot number within the + rollback segment */ + ulint state; /*!< state of the corresponding undo log + segment */ + trx_id_t trx_id; /*!< id of the trx assigned to the undo + log */ + XID xid; /*!< X/Open XA transaction + identification */ + ibool dict_operation; /*!< TRUE if a dict operation trx */ + table_id_t table_id; /*!< if a dict operation, then the table + id */ + trx_rseg_t* rseg; /*!< rseg where the undo log belongs */ + /*-----------------------------*/ + uint32_t hdr_page_no; /*!< page number of the header page in + the undo log */ + uint32_t last_page_no; /*!< page number of the last page in the + undo log; this may differ from + top_page_no during a rollback */ + uint16_t hdr_offset; /*!< header offset of the undo log on + the page */ + uint32_t size; /*!< current size in pages */ + /*-----------------------------*/ + uint32_t top_page_no; /*!< page number where the latest undo + log record was catenated; during + rollback the page from which the latest + undo record was chosen */ + uint16_t top_offset; /*!< offset of the latest undo record, + i.e., the topmost element in the undo + log if we think of it as a stack */ + undo_no_t top_undo_no; /*!< undo number of the latest record + (IB_ID_MAX if the undo log is empty) */ + buf_block_t* guess_block; /*!< guess for the buffer block where + the top page might reside */ + + /** @return whether the undo log is empty */ + bool empty() const { return top_undo_no == IB_ID_MAX; } + + /*-----------------------------*/ + UT_LIST_NODE_T(trx_undo_t) undo_list; + /*!< undo log objects in the rollback + segment are chained into lists */ +}; +#endif /* !UNIV_INNOCHECKSUM */ + +/** The offset of the undo log page header on pages of the undo log */ +#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA +/*-------------------------------------------------------------*/ +/** Transaction undo log page header offsets */ +/* @{ */ +#define 
TRX_UNDO_PAGE_TYPE 0 /*!< unused; 0 (before MariaDB 10.3.1: + 1=TRX_UNDO_INSERT or + 2=TRX_UNDO_UPDATE) */ +#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log + records for the LATEST transaction + start on this page (remember that + in an update undo log, the first page + can contain several undo logs) */ +#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this + field contains the byte offset of the + first free byte on the page */ +#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain + of undo log pages */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE) + /*!< Size of the transaction undo + log page header, in bytes */ +/* @} */ + +/** An update undo segment with just one page can be reused if it has +at most this many bytes used; we must leave space at least for one new undo +log header on the page */ + +#define TRX_UNDO_PAGE_REUSE_LIMIT (3 << (srv_page_size_shift - 2)) + +/* An update undo log segment may contain several undo logs on its first page +if the undo logs took so little space that the segment could be cached and +reused. All the undo log headers are then on the first page, and the last one +owns the undo log records on subsequent pages if the segment is bigger than +one page. If an undo log is stored in a segment, then on the first page it is +allowed to have zero undo records, but if the segment extends to several +pages, then all the rest of the pages must contain at least one undo log +record. */ + +/** The offset of the undo log segment header on the first page of the undo +log segment */ + +#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) +/** Undo log segment header */ +/* @{ */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... 
*/ + +#ifndef UNIV_INNOCHECKSUM + +#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header + on the segment header page, 0 if + none */ +#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which + the undo log segment occupies */ +#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE) + /*!< Base node for the list of pages in + the undo log segment; defined only on + the undo log segment's first page */ +/*-------------------------------------------------------------*/ +/** Size of the undo log segment header */ +#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE) +/* @} */ + +/** The undo log header. There can be several undo log headers on the first +page of an update undo log segment. */ +/* @{ */ +/*-------------------------------------------------------------*/ +/** Transaction start identifier, or 0 if the undo log segment has been +completely purged and trx_purge_free_segment() has started freeing it */ +#define TRX_UNDO_TRX_ID 0 +/** Transaction end identifier (if the log is in a history list), +or 0 if the transaction has not been committed */ +#define TRX_UNDO_TRX_NO 8 +/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of +surviving user records, this used to be called TRX_UNDO_DEL_MARKS. + +The value 1 indicates that purge needs to process the undo log segment. +The value 0 indicates that all of it has been processed, and +trx_purge_free_segment() has been invoked, so the log is not safe to access. + +Before MariaDB 10.3.1, a log segment may carry the value 0 even before +trx_purge_free_segment() was called, for those undo log records for +which purge would not result in removing delete-marked records. 
*/ +#define TRX_UNDO_NEEDS_PURGE 16 +#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record + of this log on the header page; purge + may remove undo log record from the + log start, and therefore this is not + necessarily the same as this log + header end offset */ +#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table + create, index create, or drop + transaction: in recovery + the transaction cannot be rolled back + in the usual way: a 'rollback' rather + means dropping the created or dropped + table, if it still exists */ +#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding + field is TRUE */ +#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header + on this page, 0 if none */ +#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log + header on this page, 0 if none */ +#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history + list, the file list node is here */ +/*-------------------------------------------------------------*/ +/** Size of the undo log header without XID information */ +#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) + +/** X/Open XA Transaction Identification (XID) */ +/* @{ */ +/** xid_t::formatID */ +#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE) +/** xid_t::gtrid_length */ +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +/** xid_t::bqual_length */ +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +/** Distributed transaction identifier data */ +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +/*--------------------------------------------------------------*/ +#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE) + /*!< Total size of the undo log header + with the XA XID */ +/* @} */ + +#include "trx0undo.ic" +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git 
a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic new file mode 100644 index 00000000..43af9327 --- /dev/null +++ b/storage/innobase/include/trx0undo.ic @@ -0,0 +1,158 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.ic +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" +#include "page0page.h" + +/***********************************************************************//** +Builds a roll pointer. 
+@return roll pointer: the insert flag at bit ROLL_PTR_INSERT_FLAG_POS,
+the rollback segment id from bit ROLL_PTR_RSEG_ID_POS, the page number
+from bit ROLL_PTR_PAGE_POS and the byte offset from bit ROLL_PTR_BYTE_POS */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+	bool		is_insert,	/*!< in: TRUE if insert undo log */
+	ulint		rseg_id,	/*!< in: rollback segment id */
+	uint32_t	page_no,	/*!< in: page number */
+	uint16_t	offset)		/*!< in: offset of the undo entry within page */
+{
+	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+
+	return roll_ptr_t{is_insert} << ROLL_PTR_INSERT_FLAG_POS |
+		roll_ptr_t{rseg_id} << ROLL_PTR_RSEG_ID_POS |
+		roll_ptr_t{page_no} << ROLL_PTR_PAGE_POS | offset;
+}
+
+/***********************************************************************//**
+Decodes a roll pointer.
+This is the inverse of trx_undo_build_roll_ptr(): every field is extracted
+with the same ROLL_PTR_*_POS constant that was used to place it, so the
+two functions cannot drift apart if the layout constants are changed. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
+	bool*		is_insert,	/*!< out: TRUE if insert undo log */
+	ulint*		rseg_id,	/*!< out: rollback segment id */
+	uint32_t*	page_no,	/*!< out: page number */
+	uint16_t*	offset)		/*!< out: offset of the undo
+					entry within page */
+{
+	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+	/* No bit above the insert flag may be set. */
+	ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1)));
+	/* The narrowing casts discard the bits of the higher fields. */
+	*offset= static_cast<uint16_t>(roll_ptr >> ROLL_PTR_BYTE_POS);
+	*page_no= static_cast<uint32_t>(roll_ptr >> ROLL_PTR_PAGE_POS);
+	/* The rollback segment id is 7 bits wide; mask off the insert flag
+	that sits directly above it. */
+	*rseg_id= static_cast<ulint>((roll_ptr >> ROLL_PTR_RSEG_ID_POS) & 0x7F);
+	*is_insert= static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS);
+}
+
+/***********************************************************************//**
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
+UNIV_INLINE
+bool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+	roll_ptr_t	roll_ptr)	/*!< in: roll pointer */
+{
+	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+	/* No bit above the insert flag may be set, so the shift below
+	leaves exactly the flag. */
+	ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1)));
+	return static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS);
+}
+
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated).
*/
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+	const byte*	trx_id)	/*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+{
+	compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+	/* Test the top bit of the byte that immediately follows
+	DB_TRX_ID, i.e. the first byte of DB_ROLL_PTR. */
+	return (trx_id[DATA_TRX_ID_LEN] & 0x80) != 0;
+}
+
+/** Fetch an undo log page with an exclusive latch.
+@param[in]	page_id	page id
+@param[in,out]	mtr	mini-transaction
+@return pointer to page x-latched */
+UNIV_INLINE
+buf_block_t*
+trx_undo_page_get(const page_id_t page_id, mtr_t* mtr)
+{
+	buf_block_t* const	undo_block
+		= buf_page_get(page_id, 0, RW_X_LATCH, mtr);
+
+	buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+
+	return undo_block;
+}
+
+/** Fetch an undo log page with a shared latch.
+@param[in]	page_id	page id
+@param[in,out]	mtr	mini-transaction
+@return pointer to page s-latched */
+UNIV_INLINE
+buf_block_t*
+trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr)
+{
+	buf_block_t* const	undo_block
+		= buf_page_get(page_id, 0, RW_S_LATCH, mtr);
+
+	buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+
+	return undo_block;
+}
+
+/** Find where the undo log records of an undo log end on a page.
+On the undo log header page, a nonzero TRX_UNDO_NEXT_LOG field of the
+header is the end; otherwise the end is the TRX_UNDO_PAGE_FREE field of
+the page header.
+@param[in]	undo_page	undo log page
+@param[in]	page_no		undo log header page number
+@param[in]	offset		undo log header offset
+@return end offset */
+inline
+uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no,
+                               uint16_t offset)
+{
+  if (page_no == undo_page->page.id().page_no())
+  {
+    const uint16_t next_log= mach_read_from_2(undo_page->frame + offset +
+                                              TRX_UNDO_NEXT_LOG);
+    if (next_log != 0)
+      return next_log;
+  }
+
+  return mach_read_from_2(undo_page->frame + TRX_UNDO_PAGE_HDR +
+                          TRX_UNDO_PAGE_FREE);
+}
+
+/** Get the next record in an undo log.
+@param[in] undo_page undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@return undo log record, the page latched, NULL if none */ +inline trx_undo_rec_t* +trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec, + uint32_t page_no, uint16_t offset) +{ + uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset); + uint16_t next= mach_read_from_2(undo_page->frame + rec); + return next == end ? nullptr : undo_page->frame + next; +} diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h new file mode 100644 index 00000000..cb5d67cf --- /dev/null +++ b/storage/innobase/include/trx0xa.h @@ -0,0 +1,61 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +#include "handler.h" + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +/** Sizes of transaction identifier */ +#define XIDDATASIZE 128 /*!< maximum size of a transaction + identifier, in bytes */ +#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */ + +#endif +/** X/Open XA distributed transaction status codes */ +/* @{ */ +#define XA_OK 0 /*!< normal execution */ +#define XAER_ASYNC -2 /*!< asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /*!< a resource manager error + occurred in the transaction + branch */ +#define XAER_NOTA -4 /*!< the XID is not valid */ +#define XAER_INVAL -5 /*!< invalid arguments were given */ +#define XAER_PROTO -6 /*!< routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /*!< resource manager unavailable */ +#define XAER_DUPID -8 /*!< the XID already exists */ +#define XAER_OUTSIDE -9 /*!< resource manager doing + work outside transaction */ +/* @} */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i new file mode 100644 index 00000000..6c68bf17 --- /dev/null +++ b/storage/innobase/include/univ.i @@ -0,0 +1,581 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2008, Google Inc. 
+ +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***********************************************************************//** +@file include/univ.i +Version control for database, common definitions, and include files + +Created 1/20/1994 Heikki Tuuri +****************************************************************************/ + +#ifndef univ_i +#define univ_i + +/* aux macros to convert M into "123" (string) if M is defined like +#define M 123 */ +#define _IB_TO_STR(s) #s +#define IB_TO_STR(s) _IB_TO_STR(s) + +/* The following is the InnoDB version as shown in +SELECT plugin_version FROM information_schema.plugins; +calculated in make_version_string() in sql/sql_show.cc like this: +"version >> 8" . "version & 0xff" +because the version is shown with only one dot, we skip the last +component, i.e. 
we show M.N.P as M.N */ +#define INNODB_VERSION_SHORT \ + (MYSQL_VERSION_MAJOR << 8 | MYSQL_VERSION_MINOR) + +#define INNODB_VERSION_STR \ + IB_TO_STR(MYSQL_VERSION_MAJOR) "." \ + IB_TO_STR(MYSQL_VERSION_MINOR) "." \ + IB_TO_STR(MYSQL_VERSION_PATCH) + +/** How far ahead should we tell the service manager the timeout +(time in seconds) */ +#define INNODB_EXTEND_TIMEOUT_INTERVAL 30 + +#ifdef MYSQL_DYNAMIC_PLUGIN +/* In the dynamic plugin, redefine some externally visible symbols +in order not to conflict with the symbols of a builtin InnoDB. */ + +/* Rename all C++ classes that contain virtual functions, because we +have not figured out how to apply the visibility=hidden attribute to +the virtual method table (vtable) in GCC 3. */ +# define ha_innobase ha_innodb +#endif /* MYSQL_DYNAMIC_PLUGIN */ + +#if defined(_WIN32) +# include <windows.h> +#endif /* _WIN32 */ + +/* Include a minimum number of SQL header files so that few changes +made in SQL code cause a complete InnoDB rebuild. These headers are +used throughout InnoDB but do not include too much themselves. They +support cross-platform development and expose commonly used SQL names. */ + +#include <my_global.h> +#include "my_counter.h" +#include <m_string.h> + +/* JAN: TODO: missing 5.7 header */ +#ifdef HAVE_MY_THREAD_H +//# include <my_thread.h> +#endif + +#ifndef UNIV_INNOCHECKSUM +# include <mysqld_error.h> +#endif /* !UNIV_INNOCHECKSUM */ + +/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */ +#include <sys/stat.h> + +#ifndef _WIN32 +# include <sched.h> +# include "my_config.h" +#endif + +#include <stdint.h> +#include <inttypes.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#include "my_pthread.h" + +/* Following defines are to enable performance schema +instrumentation in each of five InnoDB modules if +HAVE_PSI_INTERFACE is defined. 
*/ +#ifdef HAVE_PSI_INTERFACE +# define UNIV_PFS_MUTEX +# define UNIV_PFS_RWLOCK +# define UNIV_PFS_IO +# define UNIV_PFS_THREAD + +# include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */ +# ifdef HAVE_PSI_MEMORY_INTERFACE +# define UNIV_PFS_MEMORY +# endif /* HAVE_PSI_MEMORY_INTERFACE */ + +/* There are mutexes/rwlocks that we want to exclude from +instrumentation even if their corresponding performance schema +define is set. And this PFS_NOT_INSTRUMENTED is used +as the key value to identify those objects that would +be excluded from instrumentation. */ +# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED + +# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED) + +#ifdef HAVE_PFS_THREAD_PROVIDER_H +/* For PSI_MUTEX_CALL() and similar. */ +#include "pfs_thread_provider.h" +#endif + +#include "mysql/psi/mysql_thread.h" +/* For PSI_FILE_CALL(). */ +#ifdef HAVE_PFS_FILE_PROVIDER_H +#include "pfs_file_provider.h" +#endif + +#include "mysql/psi/mysql_file.h" + +#endif /* HAVE_PSI_INTERFACE */ + +#ifdef _WIN32 +# define YY_NO_UNISTD_H 1 +/* VC++ tries to optimise for size by default, from V8+. The size of +the pointer to member depends on whether the type is defined before the +compiler sees the type in the translation unit. This default behaviour +can cause the pointer to be a different size in different translation +units, depending on the above rule. We force optimise for size behaviour +for all cases. This is used by ut0lst.h related code. */ +# pragma pointers_to_members(full_generality, multiple_inheritance) +#endif /* _WIN32 */ + +/* DEBUG VERSION CONTROL + ===================== */ + +/* When this macro is defined then additional test functions will be +compiled. These functions live at the end of each relevant source file +and have "test_" prefix. These functions can be called from the end of +innodb_init() or they can be called from gdb after srv_start() has executed +using the call command. 
*/ +/* +#define UNIV_COMPILE_TEST_FUNCS +#define UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR +#define UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH +#define UNIV_ENABLE_UNIT_TEST_DICT_STATS +#define UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT +*/ + +#ifdef DBUG_OFF +# undef UNIV_DEBUG +#elif !defined UNIV_DEBUG +# define UNIV_DEBUG +#endif + +#if 0 +#define UNIV_DEBUG_PRINT /* Enable the compilation of + some debug print functions */ +#define UNIV_AHI_DEBUG /* Enable adaptive hash index + debugging without UNIV_DEBUG */ +#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column + debugging without UNIV_DEBUG */ +#define UNIV_DEBUG_LOCK_VALIDATE /* Enable + ut_ad(lock_rec_validate_page()) + assertions. */ +#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */ +#define UNIV_HASH_DEBUG /* debug HASH_ macros */ +#define UNIV_IBUF_DEBUG /* debug the insert buffer */ +#define UNIV_PERF_DEBUG /* debug flag that enables + light weight performance + related stuff. */ +#define UNIV_SEARCH_PERF_STAT /* statistics for the + adaptive hash index */ +#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output + in sync0sync.cc */ +#define UNIV_BTR_PRINT /* enable functions for + printing B-trees */ +#define UNIV_ZIP_DEBUG /* extensive consistency checks + for compressed pages */ +#define UNIV_ZIP_COPY /* call page_zip_copy_recs() + more often */ +#define UNIV_AIO_DEBUG /* prints info about + submitted and reaped AIO + requests to the log. */ +#define UNIV_STATS_DEBUG /* prints various stats + related debug info from + dict0stats.c */ +#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging + info output */ +#endif + +#define UNIV_BTR_DEBUG /* check B-tree links */ +#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */ + +// #define UNIV_SQL_DEBUG + +/* Linkage specifier for non-static InnoDB symbols (variables and functions) +that are only referenced from within InnoDB, not from MySQL. 
We disable the +GCC visibility directive on all Sun operating systems because there is no +easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */ +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER) +# define UNIV_INTERN __attribute__((visibility ("hidden"))) +#else +# define UNIV_INTERN +#endif + +#ifndef MY_ATTRIBUTE +#if defined(__GNUC__) +# define MY_ATTRIBUTE(A) __attribute__(A) +#else +# define MY_ATTRIBUTE(A) +#endif +#endif + +#define UNIV_INLINE static inline + +#define UNIV_WORD_SIZE SIZEOF_SIZE_T + +/** The following alignment is used in memory allocations in memory heap +management to ensure correct alignment for doubles etc. */ +#define UNIV_MEM_ALIGNMENT 8U + +/* + DATABASE VERSION CONTROL + ======================== +*/ + +#ifdef HAVE_LZO +#define IF_LZO(A,B) A +#else +#define IF_LZO(A,B) B +#endif + +#ifdef HAVE_LZ4 +#define IF_LZ4(A,B) A +#else +#define IF_LZ4(A,B) B +#endif + +#ifdef HAVE_LZMA +#define IF_LZMA(A,B) A +#else +#define IF_LZMA(A,B) B +#endif + +#ifdef HAVE_BZIP2 +#define IF_BZIP2(A,B) A +#else +#define IF_BZIP2(A,B) B +#endif + +#ifdef HAVE_SNAPPY +#define IF_SNAPPY(A,B) A +#else +#define IF_SNAPPY(A,B) B +#endif + +#if defined (HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32) +#define IF_PUNCH_HOLE(A,B) A +#else +#define IF_PUNCH_HOLE(A,B) B +#endif + +/** log2 of smallest compressed page size (1<<10 == 1024 bytes) +Note: This must never change! */ +#define UNIV_ZIP_SIZE_SHIFT_MIN 10U + +/** log2 of largest compressed page size (1<<14 == 16384 bytes). +A compressed page directory entry reserves 14 bits for the start offset +and 2 bits for flags. This limits the uncompressed page size to 16k. +*/ +#define UNIV_ZIP_SIZE_SHIFT_MAX 14U + +/* Define the Min, Max, Default page sizes. */ +/** Minimum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MIN 12U +/** log2 of largest page size (1<<16 == 65536 bytes). 
*/ +/** Maximum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MAX 16U +/** log2 of default page size (1<<14 == 16384 bytes). */ +/** Default Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_DEF 14U +/** Original 16k InnoDB Page Size Shift, in case the default changes */ +#define UNIV_PAGE_SIZE_SHIFT_ORIG 14U +/** Original 16k InnoDB Page Size as an ssize (log2 - 9) */ +#define UNIV_PAGE_SSIZE_ORIG (UNIV_PAGE_SIZE_SHIFT_ORIG - 9U) + +/** Minimum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MIN (1U << UNIV_PAGE_SIZE_SHIFT_MIN) +/** Maximum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MAX (1U << UNIV_PAGE_SIZE_SHIFT_MAX) +/** Default page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_DEF (1U << UNIV_PAGE_SIZE_SHIFT_DEF) +/** Original 16k page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_ORIG (1U << UNIV_PAGE_SIZE_SHIFT_ORIG) + +/** Smallest compressed page size */ +#define UNIV_ZIP_SIZE_MIN (1U << UNIV_ZIP_SIZE_SHIFT_MIN) + +/** Largest compressed page size */ +#define UNIV_ZIP_SIZE_MAX (1U << UNIV_ZIP_SIZE_SHIFT_MAX) + +/** Largest possible ssize for an uncompressed page. +(The convention 'ssize' is used for 'log2 minus 9' or the number of +shifts starting with 512.) +This max number varies depending on srv_page_size. */ +#define UNIV_PAGE_SSIZE_MAX \ + ulint(srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1U) + +/** Smallest possible ssize for an uncompressed page. */ +#define UNIV_PAGE_SSIZE_MIN \ + ulint(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1U) + +/** Maximum number of parallel threads in a parallelized operation */ +#define UNIV_MAX_PARALLELISM 32 + +/** This is the "mbmaxlen" for my_charset_filename (defined in +strings/ctype-utf8.c), which is used to encode File and Database names. */ +#define FILENAME_CHARSET_MAXNAMLEN 5 + +/** The maximum length of an encode table name in bytes. The max +table and database names are NAME_CHAR_LEN (64) characters. 
After the +encoding, the max length would be NAME_CHAR_LEN (64) * +FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a +terminating '\0'. InnoDB can handle longer names internally */ +#define MAX_TABLE_NAME_LEN 320 + +/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is +the MySQL's NAME_LEN, see check_and_convert_db_name(). */ +#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN + +/** MAX_FULL_NAME_LEN defines the full name path including the +database name and table name. In addition, 14 bytes is added for: + 2 for surrounding quotes around table name + 1 for the separating dot (.) + 9 for the #mysql50# prefix */ +#define MAX_FULL_NAME_LEN \ + (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14) + +/** Maximum length of the compression algorithm string. Currently we support +only (NONE | ZLIB | LZ4). */ +#define MAX_COMPRESSION_LEN 4 + +/** The maximum length in bytes that a database name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_DB_UTF8_LEN (NAME_LEN + 1) + +/** The maximum length in bytes that a table name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix)) + +/* + UNIVERSAL TYPE DEFINITIONS + ========================== +*/ + +/** Unsigned octet of bits */ +typedef unsigned char byte; +/** Machine-word-width unsigned integer */ +typedef size_t ulint; +/** Machine-word-width signed integer */ +typedef ssize_t lint; + +/** ulint format for the printf() family of functions */ +#define ULINTPF "%zu" +/** ulint hexadecimal format for the printf() family of functions */ +#define ULINTPFx "%zx" + +#ifdef _WIN32 +/* Use the integer types and formatting strings defined in Visual Studio. 
*/ +# define UINT32PF "%u" +# define INT64PF "%lld" +# define UINT64scan "llu" +# define UINT64PFx "%016llx" +#elif defined __APPLE__ +/* Apple prefers to call the 64-bit types 'long long' +in both 32-bit and 64-bit environments. */ +# define UINT32PF "%" PRIu32 +# define INT64PF "%lld" +# define UINT64scan "llu" +# define UINT64PFx "%016llx" +#elif defined _AIX +/* Workaround for macros expansion trouble */ +# define UINT32PF "%u" +# define INT64PF "%lld" +# define UINT64scan "lu" +# define UINT64PFx "%016llx" +#else +/* Use the integer types and formatting strings defined in the C99 standard. */ +# define UINT32PF "%" PRIu32 +# define INT64PF "%" PRId64 +# define UINT64scan PRIu64 +# define UINT64PFx "%016" PRIx64 +#endif + +#ifdef UNIV_INNOCHECKSUM +extern bool strict_verify; +extern FILE* log_file; +extern uint32_t cur_page_num; +#endif /* UNIV_INNOCHECKSUM */ + +typedef int64_t ib_int64_t; +typedef uint64_t ib_uint64_t; +typedef uint32_t ib_uint32_t; + +#define UINT64PF "%" UINT64scan +#define IB_ID_FMT UINT64PF + +/** Log sequence number (also used for redo log byte arithmetics) */ +typedef ib_uint64_t lsn_t; + +/** The 'undefined' value for a ulint */ +#define ULINT_UNDEFINED ((ulint)(-1)) + +/** The 'undefined' value for a ib_uint64_t */ +#define UINT64_UNDEFINED ((ib_uint64_t)(-1)) + +/** The bitmask of 32-bit unsigned integer */ +#define ULINT32_MASK 0xFFFFFFFFU +/** The undefined 32-bit unsigned integer */ +#define ULINT32_UNDEFINED ULINT32_MASK + +/** Maximum value for a ulint */ +#define ULINT_MAX ((ulint)(-2)) + +/** Maximum value for ib_uint64_t */ +#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL)) + +/** The generic InnoDB system object identifier data type */ +typedef ib_uint64_t ib_id_t; +#define IB_ID_MAX (~(ib_id_t) 0) +#define IB_ID_FMT UINT64PF + +#ifndef UINTMAX_MAX +#define UINTMAX_MAX IB_UINT64_MAX +#endif +/** This 'ibool' type is used within Innobase. Remember that different included +headers may define 'bool' differently. 
Do not assume that 'bool' is a ulint! */ +#define ibool ulint + +#ifndef TRUE + +#define TRUE 1 +#define FALSE 0 + +#endif + +#define UNIV_NOTHROW + +/** The following number as the length of a logical field means that the field +has the SQL NULL as its value. NOTE that because we assume that the length +of a field is a 32-bit integer when we store it, for example, to an undo log +on disk, we must have also this number fit in 32 bits, also in 64-bit +computers! */ + +#define UNIV_SQL_NULL ULINT32_UNDEFINED + +/** Lengths which are not UNIV_SQL_NULL, but bigger than the following +number indicate that a field contains a reference to an externally +stored part of the field in the tablespace. The length field then +contains the sum of the following flag and the locally stored len. */ + +#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_DEF) + +#if defined(__GNUC__) +/* Tell the compiler that variable/function is unused. */ +# define UNIV_UNUSED MY_ATTRIBUTE ((unused)) +#else +# define UNIV_UNUSED +#endif /* CHECK FOR GCC VER_GT_2 */ + +/* Some macros to improve branch prediction and reduce cache misses */ +#if defined(COMPILER_HINTS) && defined(__GNUC__) +/* Tell the compiler that 'expr' probably evaluates to 'constant'. */ +# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant) +/* Tell the compiler that a pointer is likely to be NULL */ +# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ptr) != 0, 0) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read. */ +# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read or written. 
*/ +# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3) + +/* Sun Studio includes sun_prefetch.h as of version 5.9 */ +#elif (defined(__SUNPRO_C) || defined(__SUNPRO_CC)) + +# include <sun_prefetch.h> + +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) + +# if defined(COMPILER_HINTS) +//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr) +# else +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +# endif /* COMPILER_HINTS */ + +# elif defined __WIN__ && defined COMPILER_HINTS +# include <xmmintrin.h> +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +// __MM_HINT_T0 - (temporal data) +// prefetch data into all levels of the cache hierarchy. +# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0) +# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0) +#else +/* Dummy versions of the macros */ +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +#endif + +/* Tell the compiler that cond is likely to hold */ +#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE) +/* Tell the compiler that cond is unlikely to hold */ +#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE) + +/* Compile-time constant of the given array's size. */ +#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0])) + +/* The return type from a thread's start function differs between Unix and +Windows, so define a typedef for it and a macro to use at the end of such +functions. 
*/ + +#ifdef _WIN32 +typedef DWORD os_thread_ret_t; +# define OS_THREAD_DUMMY_RETURN return(0) +# define OS_PATH_SEPARATOR '\\' +# define OS_PATH_SEPARATOR_ALT '/' +#else +typedef void* os_thread_ret_t; +# define OS_THREAD_DUMMY_RETURN return(NULL) +# define OS_PATH_SEPARATOR '/' +# define OS_PATH_SEPARATOR_ALT '\\' +#endif + +#include <stdio.h> +#include "db0err.h" +#include "ut0dbg.h" +#include "ut0lst.h" +#include "ut0ut.h" +#include "sync0types.h" + +extern ulong srv_page_size_shift; +extern ulong srv_page_size; + +/* Dimension of spatial object we support so far. It has its root in +myisam/sp_defs.h. We only support 2 dimension data */ +#define SPDIMS 2 + +#endif diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h new file mode 100644 index 00000000..1a428d73 --- /dev/null +++ b/storage/innobase/include/ut0byte.h @@ -0,0 +1,117 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0byte.h +Utilities for byte operations + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0byte_h +#define ut0byte_h + +#include "univ.i" + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ + MY_ATTRIBUTE((const)); + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/** Round down a pointer to the nearest aligned address. 
+@param ptr pointer +@param alignment a power of 2 +@return aligned pointer */ +static inline void *ut_align_down(void *ptr, size_t alignment) +{ + ut_ad(alignment > 0); + ut_ad(ut_is_2pow(alignment)); + ut_ad(ptr); + static_assert(sizeof ptr == sizeof(size_t), "compatibility"); + + return reinterpret_cast<void*>(reinterpret_cast<size_t>(ptr) & + ~(alignment - 1)); +} + +static inline const void *ut_align_down(const void *ptr, size_t alignment) +{ + return ut_align_down(const_cast<void*>(ptr), alignment); +} + +/** Compute the offset of a pointer from the nearest aligned address. +@param ptr pointer +@param alignment a power of 2 +@return distance from aligned pointer */ +inline size_t ut_align_offset(const void *ptr, size_t alignment) +{ + ut_ad(alignment > 0); + ut_ad(ut_is_2pow(alignment)); + ut_ad(ptr); + return reinterpret_cast<size_t>(ptr) & (alignment - 1); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n); /*!< in: nth bit requested */ +/*****************************************************************//** +Sets the nth bit of a ulint. +@return the ulint with the bit set as requested */ +UNIV_INLINE +ulint +ut_bit_set_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n, /*!< in: nth bit requested */ + ibool val); /*!< in: value for the bit to set */ + +#include "ut0byte.ic" + +#endif diff --git a/storage/innobase/include/ut0byte.ic b/storage/innobase/include/ut0byte.ic new file mode 100644 index 00000000..a4b5d4a7 --- /dev/null +++ b/storage/innobase/include/ut0byte.ic @@ -0,0 +1,109 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0byte.ic +Utilities for byte operations + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ +{ + ut_ad(high <= ULINT32_MASK); + ut_ad(low <= ULINT32_MASK); + return(((ib_uint64_t) high) << 32 | low); +} + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return(n & ~((ib_uint64_t) align_no - 1)); +} + +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. 
+@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ib_uint64_t align_1 = (ib_uint64_t) align_no - 1; + + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return((n + align_1) & ~align_1); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n) /*!< in: nth bit requested */ +{ + ut_ad(n < 8 * sizeof(ulint)); + return(1 & (a >> n)); +} + +/*****************************************************************//** +Sets the nth bit of a ulint. +@return the ulint with the bit set as requested */ +UNIV_INLINE +ulint +ut_bit_set_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n, /*!< in: nth bit requested */ + ibool val) /*!< in: value for the bit to set */ +{ + ut_ad(n < 8 * sizeof(ulint)); + if (val) { + return(((ulint) 1 << n) | a); + } else { + return(~((ulint) 1 << n) & a); + } +} diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h new file mode 100644 index 00000000..646a5f36 --- /dev/null +++ b/storage/innobase/include/ut0counter.h @@ -0,0 +1,125 @@ +/***************************************************************************** + +Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0counter.h + +Counter utility class + +Created 2012/04/12 by Sunny Bains +*******************************************************/ + +#ifndef ut0counter_h +#define ut0counter_h + +#include "os0thread.h" +#include "my_rdtsc.h" + +/** CPU cache line size */ +#ifdef CPU_LEVEL1_DCACHE_LINESIZE +# define CACHE_LINE_SIZE CPU_LEVEL1_DCACHE_LINESIZE +#else +# error CPU_LEVEL1_DCACHE_LINESIZE is undefined +#endif /* CPU_LEVEL1_DCACHE_LINESIZE */ + +/** Default number of slots to use in ib_counter_t */ +#define IB_N_SLOTS 64 + +/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles +as a random value. See the comments for my_timer_cycles() */ +/** @return result from RDTSC or similar functions. */ +static inline size_t +get_rnd_value() +{ + size_t c = static_cast<size_t>(my_timer_cycles()); + + if (c != 0) { + return c; + } + + /* We may go here if my_timer_cycles() returns 0, + so we have to have the plan B for the counter. */ +#if !defined(_WIN32) + return (size_t)os_thread_get_curr_id(); +#else + LARGE_INTEGER cnt; + QueryPerformanceCounter(&cnt); + + return static_cast<size_t>(cnt.QuadPart); +#endif /* !_WIN32 */ +} + +/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic +so the results are not guaranteed to be 100% accurate but close +enough. 
Creates an array of counters and separates each element by the +CACHE_LINE_SIZE bytes */ +template <typename Type, int N = IB_N_SLOTS> +struct ib_counter_t { + /** Increment the counter by 1. */ + void inc() { add(1); } + + /** Increment the counter by 1. + @param[in] index a reasonably thread-unique identifier */ + void inc(size_t index) { add(index, 1); } + + /** Add to the counter. + @param[in] n amount to be added */ + void add(Type n) { add(get_rnd_value(), n); } + + /** Add to the counter. + @param[in] index a reasonably thread-unique identifier + @param[in] n amount to be added */ + void add(size_t index, Type n) { + index = index % N; + + ut_ad(index < UT_ARR_SIZE(m_counter)); + + m_counter[index].value.fetch_add(n, std::memory_order_relaxed); + } + + /* @return total value - not 100% accurate, since it is relaxed atomic*/ + operator Type() const { + Type total = 0; + + for (const auto &counter : m_counter) { + total += counter.value.load(std::memory_order_relaxed); + } + + return(total); + } + +private: + /** Atomic which occupies whole CPU cache line. + Note: We rely on the default constructor of std::atomic and + do not explicitly initialize the contents. This works for us, + because ib_counter_t is only intended for usage with global + memory that is allocated from the .bss and thus guaranteed to + be zero-initialized by the run-time environment. 
+ @see srv_stats + @see rw_lock_stats */ + struct ib_counter_element_t { + MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<Type> value; + }; + static_assert(sizeof(ib_counter_element_t) == CACHE_LINE_SIZE, ""); + + /** Array of counter elements */ + MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_element_t m_counter[N]; +}; + +#endif /* ut0counter_h */ diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h new file mode 100644 index 00000000..0cbccb97 --- /dev/null +++ b/storage/innobase/include/ut0crc32.h @@ -0,0 +1,37 @@ +/***************************************************************************** + +Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0crc32.h +CRC32 implementation + +Created Aug 10, 2011 Vasil Dimov +*******************************************************/ + +#ifndef ut0crc32_h +#define ut0crc32_h + +#include "univ.i" +#include <my_sys.h> +static inline uint32_t ut_crc32(const byte *s, size_t size) +{ + return my_crc32c(0, s, size); +} + +#endif /* ut0crc32_h */ diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h new file mode 100644 index 00000000..85856660 --- /dev/null +++ b/storage/innobase/include/ut0dbg.h @@ -0,0 +1,179 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*****************************************************************//** +@file include/ut0dbg.h +Debug utilities for Innobase + +Created 1/30/1994 Heikki Tuuri +**********************************************************************/ + +#ifndef ut0dbg_h +#define ut0dbg_h + +#ifdef UNIV_INNOCHECKSUM +#define ut_a assert +#define ut_ad assert +#define ut_error assert(0) +#else /* !UNIV_INNOCHECKSUM */ + +/* Do not include univ.i because univ.i includes this. */ + +/*************************************************************//** +Report a failed assertion. */ +ATTRIBUTE_NORETURN ATTRIBUTE_COLD __attribute__((nonnull(2))) +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /*!< in: the failed assertion */ + const char* file, /*!< in: source file containing the assertion */ + unsigned line); /*!< in: line number of the assertion */ + +/** Abort execution if EXPR does not evaluate to nonzero. +@param EXPR assertion expression that should hold */ +#define ut_a(EXPR) do { \ + if (UNIV_UNLIKELY(!(ulint) (EXPR))) { \ + ut_dbg_assertion_failed(#EXPR, \ + __FILE__, __LINE__); \ + } \ +} while (0) + +/** Abort execution. */ +#define ut_error \ + ut_dbg_assertion_failed(0, __FILE__, __LINE__) + +/** Debug assertion */ +#define ut_ad DBUG_SLOW_ASSERT +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) +/** Debug statement. Expands to EXPR when UNIV_DEBUG is defined or +DBUG_OFF is not defined; otherwise it expands to nothing. */ +#define ut_d(EXPR) EXPR +#else +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. 
*/ +#define ut_d(EXPR) +#endif + +#if defined(HAVE_SYS_TIME_H) && defined(HAVE_SYS_RESOURCE_H) + +#define HAVE_UT_CHRONO_T + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +/** A "chronometer" used to clock snippets of code. +Example usage: + ut_chrono_t ch("this loop"); + for (;;) { ... } + ch.show(); +would print the timings of the for() loop, prefixed with "this loop:" */ +class ut_chrono_t { +public: + /** Constructor. + @param[in] name chrono's name, used when showing the values */ + ut_chrono_t( + const char* name) + : + m_name(name), + m_show_from_destructor(true) + { + reset(); + } + + /** Resets the chrono (records the current time in it). */ + void + reset() + { + gettimeofday(&m_tv, NULL); + + getrusage(RUSAGE_SELF, &m_ru); + } + + /** Shows the time elapsed and usage statistics since the last reset. */ + void + show() + { + struct rusage ru_now; + struct timeval tv_now; + struct timeval tv_diff; + + getrusage(RUSAGE_SELF, &ru_now); + + gettimeofday(&tv_now, NULL); + +#ifndef timersub +#define timersub(a, b, r) \ + do { \ + (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((r)->tv_usec < 0) { \ + (r)->tv_sec--; \ + (r)->tv_usec += 1000000; \ + } \ + } while (0) +#endif /* timersub */ + +#define CHRONO_PRINT(type, tvp) \ + fprintf(stderr, "%s: %s% 5ld.%06ld sec\n", \ + m_name, type, \ + static_cast<long>((tvp)->tv_sec), \ + static_cast<long>((tvp)->tv_usec)) + + timersub(&tv_now, &m_tv, &tv_diff); + CHRONO_PRINT("real", &tv_diff); + + timersub(&ru_now.ru_utime, &m_ru.ru_utime, &tv_diff); + CHRONO_PRINT("user", &tv_diff); + + timersub(&ru_now.ru_stime, &m_ru.ru_stime, &tv_diff); + CHRONO_PRINT("sys ", &tv_diff); + } + + /** Cause the timings not to be printed from the destructor. */ + void end() + { + m_show_from_destructor = false; + } + + /** Destructor. */ + ~ut_chrono_t() + { + if (m_show_from_destructor) { + show(); + } + } + +private: + /** Name of this chronometer. 
*/ + const char* m_name; + + /** True if the current timings should be printed by the destructor. */ + bool m_show_from_destructor; + + /** getrusage() result as of the last reset(). */ + struct rusage m_ru; + + /** gettimeofday() result as of the last reset(). */ + struct timeval m_tv; +}; + +#endif /* HAVE_SYS_TIME_H && HAVE_SYS_RESOURCE_H */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h new file mode 100644 index 00000000..7e27e108 --- /dev/null +++ b/storage/innobase/include/ut0list.h @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.h +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A double-linked list. 
This differs from the one in ut0lst.h in that in this +one, each list node contains a pointer to the data, whereas the one in +ut0lst.h uses a strategy where the list pointers are embedded in the data +items themselves. + +Use this one when you need to store arbitrary data in the list where you +can't embed the list pointers in the data, if a data item needs to be +stored in multiple lists, etc. + +Note about the memory management: ib_list_t is a fixed-size struct whose +allocation/deallocation is done through ib_list_create/ib_list_free, but the +memory for the list nodes is allocated through a user-given memory heap, +which can either be the same for all nodes or vary per node. Most users will +probably want to create a memory heap to store the item-specific data, and +pass in this same heap to the list node creation functions, thus +automatically freeing the list node when the item's heap is freed. + +************************************************************************/ + +#ifndef IB_LIST_H +#define IB_LIST_H + +#include "mem0mem.h" + +struct ib_list_t; +struct ib_list_node_t; + +/****************************************************************//** +Create a new list using mem_alloc. Lists created with this function must be +freed with ib_list_free. +@return list */ +ib_list_t* +ib_list_create(void); +/*=================*/ + +/****************************************************************//** +Free a list. */ +void +ib_list_free( +/*=========*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Add the data to the end of the list. +@return new list node */ +ib_list_node_t* +ib_list_add_last( +/*=============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Remove the node from the list. 
*/ +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* node); /*!< in: node to remove */ + +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list); /*!< in: list */ + +/******************************************************************** +Check if list is empty. +@return TRUE if empty else FALSE */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list); /*!< in: list */ + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list); /*!< in: list */ + +/* List. */ +struct ib_list_t { + ib_list_node_t* first; /*!< first node */ + ib_list_node_t* last; /*!< last node */ +}; + +/* A list node. */ +struct ib_list_node_t { + ib_list_node_t* prev; /*!< previous node */ + ib_list_node_t* next; /*!< next node */ + void* data; /*!< user data */ +}; + +/* Quite often, the only additional piece of data you need is the per-item +memory heap, so we have this generic struct available to use in those +cases. 
*/ +struct ib_list_helper_t { + mem_heap_t* heap; /*!< memory heap */ + void* data; /*!< user data */ +}; + +#include "ut0list.ic" + +#endif diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic new file mode 100644 index 00000000..3bdba52b --- /dev/null +++ b/storage/innobase/include/ut0list.ic @@ -0,0 +1,80 @@ +/***************************************************************************** + +Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.ic +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->first); +} + +/****************************************************************//** +Get the last node in the list. 
+@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->last); +} + +/******************************************************************** +Check if list is empty, i.e. neither the first nor the last pointer is set. +@return TRUE if empty else FALSE */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list) /*!< in: list */ +{ + return(!(list->first || list->last)); +} + +/******************************************************************** +Get number of items on list by walking it from the first node and +following the next pointers until NULL. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*!< in: list */ +{ + ulint len = 0; + ib_list_node_t* node = list->first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h new file mode 100644 index 00000000..9a5f3059 --- /dev/null +++ b/storage/innobase/include/ut0lst.h @@ -0,0 +1,568 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0lst.h +List utilities + +Created 9/10/1995 Heikki Tuuri +Rewritten by Sunny Bains Dec 2011. +***********************************************************************/ + +#ifndef ut0lst_h +#define ut0lst_h + +/* Do not include univ.i because univ.i includes this. */ + +#include "ut0dbg.h" + +/* This module implements the two-way linear list. Note that a single +list node may belong to two or more lists, but is only on one list +at a time. */ + +/*******************************************************************//** +The two way list node. +@param TYPE the list node type name */ +template <typename Type> +struct ut_list_node { + Type* prev; /*!< pointer to the previous + node, NULL if start of list */ + Type* next; /*!< pointer to next node, + NULL if end of list */ + + void reverse() + { + Type* tmp = prev; + prev = next; + next = tmp; + } +}; + +/** Macro used for legacy reasons */ +#define UT_LIST_NODE_T(t) ut_list_node<t> + +/*******************************************************************//** +The two-way list base node. The base node contains pointers to both ends +of the list and a count of nodes in the list (excluding the base node +from the count). We also store a pointer to the member field so that it +doesn't have to be specified when doing list operations. 
+@param Type the type of the list element +@param NodePtr field member pointer that points to the list node */ +template <typename Type, typename NodePtr> +struct ut_list_base { + typedef Type elem_type; + typedef NodePtr node_ptr; + typedef ut_list_node<Type> node_type; + + ulint count; /*!< count of nodes in list */ + elem_type* start; /*!< pointer to list start, + NULL if empty */ + elem_type* end; /*!< pointer to list end, + NULL if empty */ + node_ptr node; /*!< Pointer to member field + that is used as a link node */ +#ifdef UNIV_DEBUG + ulint init; /*!< UT_LIST_INITIALISED if + the list was initialised with + UT_LIST_INIT() */ +#endif /* UNIV_DEBUG */ + + void reverse() + { + Type* tmp = start; + start = end; + end = tmp; + } +}; + +#define UT_LIST_BASE_NODE_T(t) ut_list_base<t, ut_list_node<t> t::*> + +#ifdef UNIV_DEBUG +# define UT_LIST_INITIALISED 0xCAFE +# define UT_LIST_INITIALISE(b) (b).init = UT_LIST_INITIALISED +# define UT_LIST_IS_INITIALISED(b) ut_a(((b).init == UT_LIST_INITIALISED)) +#else +# define UT_LIST_INITIALISE(b) +# define UT_LIST_IS_INITIALISED(b) +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Note: This is really the list constructor. We should be able to use +placement new here. +Initializes the base node of a two-way list: resets the count and the +start/end pointers and records the link-node member pointer. +@param b the list base node +@param pmf pointer to member field that will be used as the link node */ +#define UT_LIST_INIT(b, pmf) \ +{ \ + (b).count = 0; \ + (b).start = 0; \ + (b).end = 0; \ + (b).node = pmf; \ + UT_LIST_INITIALISE(b); \ +} + +/** Functor for accessing the embedded node within a list element. This is +required because some lists can have the node embedded inside a nested +struct/union. See lock0priv.h (table locks) for an example. It provides a +specialised functor to grant access to the list node. 
*/ +template <typename Type> +struct GenericGetNode { + + typedef ut_list_node<Type> node_type; + + GenericGetNode(node_type Type::* node) : m_node(node) {} + + node_type& operator() (Type& elem) + { + return(elem.*m_node); + } + + node_type Type::*m_node; +}; + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +Updates list.start, sets list.end too when the list was empty, and +increments list.count. +@param list the base node (not a pointer to it) +@param elem the element to add */ +template <typename List> +void +ut_list_prepend( + List& list, + typename List::elem_type* elem) +{ + typename List::node_type& elem_node = elem->*list.node; + + UT_LIST_IS_INITIALISED(list); + + elem_node.prev = 0; + elem_node.next = list.start; + + if (list.start != 0) { + typename List::node_type& base_node = + list.start->*list.node; + + ut_ad(list.start != elem); + + base_node.prev = elem; + } + + list.start = elem; + + if (list.end == 0) { + list.end = elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param LIST the base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_FIRST(LIST, ELEM) ut_list_prepend(LIST, ELEM) + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. 
+@param list list +@param elem the element to add +@param get_node to get the list node for that element */ +template <typename List, typename Functor> +void +ut_list_append( + List& list, + typename List::elem_type* elem, + Functor get_node) +{ + typename List::node_type& node = get_node(*elem); + + UT_LIST_IS_INITIALISED(list); + + node.next = 0; + node.prev = list.end; + + if (list.end != 0) { + typename List::node_type& base_node = get_node(*list.end); + + ut_ad(list.end != elem); + + base_node.next = elem; + } + + list.end = elem; + + if (list.start == 0) { + list.start = elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param list list +@param elem the element to add */ +template <typename List> +void +ut_list_append( + List& list, + typename List::elem_type* elem) +{ + ut_list_append( + list, elem, + GenericGetNode<typename List::elem_type>(list.node)); +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param LIST list base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_LAST(LIST, ELEM) ut_list_append(LIST, ELEM) + +/*******************************************************************//** +Inserts a ELEM2 after ELEM1 in a list. 
+@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after ELEM1 */ +template <typename List> +void +ut_list_insert( + List& list, + typename List::elem_type* elem1, + typename List::elem_type* elem2) +{ + ut_ad(elem1 != elem2); + UT_LIST_IS_INITIALISED(list); + + typename List::node_type& elem1_node = elem1->*list.node; + typename List::node_type& elem2_node = elem2->*list.node; + + elem2_node.prev = elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + typename List::node_type& next_node = + elem1_node.next->*list.node; + + next_node.prev = elem2; + } + + elem1_node.next = elem2; + + if (list.end == elem1) { + list.end = elem2; + } + + ++list.count; +} + +/*******************************************************************//** +Inserts a ELEM2 after ELEM1 in a list. +@param LIST list base node (not a pointer to it) +@param ELEM1 node after which ELEM2 is inserted +@param ELEM2 node being inserted after ELEM1 */ +#define UT_LIST_INSERT_AFTER(LIST, ELEM1, ELEM2) \ + ut_list_insert(LIST, ELEM1, ELEM2) + +/*******************************************************************//** +Inserts a ELEM2 after ELEM1 in a list. 
+@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after ELEM1 +@param get_node to get the list node for that element */ + +template <typename List, typename Functor> +void +ut_list_insert( + List& list, + typename List::elem_type* elem1, + typename List::elem_type* elem2, + Functor get_node) +{ + ut_ad(elem1 != elem2); + UT_LIST_IS_INITIALISED(list); + + typename List::node_type& elem1_node = get_node(*elem1); + typename List::node_type& elem2_node = get_node(*elem2); + + elem2_node.prev = elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + typename List::node_type& next_node = + get_node(*elem1_node.next); + + next_node.prev = elem2; + } + + elem1_node.next = elem2; + + if (list.end == elem1) { + list.end = elem2; + } + + ++list.count; + +} +/*******************************************************************//** +Removes a node from a two-way linked list. +@param list the base node (not a pointer to it) +@param node member node within list element that is to be removed +@param get_node functor to get the list node from elem */ +template <typename List, typename Functor> +void +ut_list_remove( + List& list, + typename List::node_type& node, + Functor get_node) +{ + ut_a(list.count > 0); + UT_LIST_IS_INITIALISED(list); + + if (node.next != NULL) { + typename List::node_type& next_node = + get_node(*node.next); + + next_node.prev = node.prev; + } else { + list.end = node.prev; + } + + if (node.prev != NULL) { + typename List::node_type& prev_node = + get_node(*node.prev); + + prev_node.next = node.next; + } else { + list.start = node.next; + } + + node.next = 0; + node.prev = 0; + + --list.count; +} + +/*******************************************************************//** +Removes a node from a two-way linked list. 
+@param list the base node (not a pointer to it) +@param elem element to be removed from the list +@param get_node functor to get the list node from elem */ +template <typename List, typename Functor> +void +ut_list_remove( + List& list, + typename List::elem_type* elem, + Functor get_node) +{ + ut_list_remove(list, get_node(*elem), get_node); +} + +/*******************************************************************//** +Removes a node from a two-way linked list. +@param list the base node (not a pointer to it) +@param elem element to be removed from the list */ +template <typename List> +void +ut_list_remove( + List& list, + typename List::elem_type* elem) +{ + ut_list_remove( + list, elem->*list.node, + GenericGetNode<typename List::elem_type>(list.node)); +} + +/*******************************************************************//** +Removes a node from a two-way linked list. +@param LIST the base node (not a pointer to it) +@param ELEM node to be removed from the list */ +#define UT_LIST_REMOVE(LIST, ELEM) ut_list_remove(LIST, ELEM) + +/********************************************************************//** +Gets the next node in a two-way list. +@param NAME list name +@param N pointer to a node +@return the successor of N in NAME, or NULL */ +#define UT_LIST_GET_NEXT(NAME, N) (((N)->NAME).next) + +/********************************************************************//** +Gets the previous node in a two-way list. +@param NAME list name +@param N pointer to a node +@return the predecessor of N in NAME, or NULL */ +#define UT_LIST_GET_PREV(NAME, N) (((N)->NAME).prev) + +/********************************************************************//** +Alternative macro to get the number of nodes in a two-way list, i.e., +its length. +@param BASE the base node (not a pointer to it). 
+@return the number of nodes in the list */ +#define UT_LIST_GET_LEN(BASE) (BASE).count + +/********************************************************************//** +Gets the first node in a two-way list. +@param BASE the base node (not a pointer to it) +@return first node, or NULL if the list is empty */ +#define UT_LIST_GET_FIRST(BASE) (BASE).start + +/********************************************************************//** +Gets the last node in a two-way list. +@param BASE the base node (not a pointer to it) +@return last node, or NULL if the list is empty */ +#define UT_LIST_GET_LAST(BASE) (BASE).end + +struct NullValidate { void operator()(const void*) const {} }; + +/** Iterate over all the elements and call the functor for each element. +@param[in] list base node (not a pointer to it) +@param[in,out] functor Functor that is called for each element in the list */ +template <typename List, class Functor> +inline void ut_list_map(const List& list, Functor& functor) +{ + ulint count = 0; + + UT_LIST_IS_INITIALISED(list); + + for (typename List::elem_type* elem = list.start; elem; + elem = (elem->*list.node).next, ++count) { + + functor(elem); + } + + ut_a(count == list.count); +} + +/** Iterate over all the elements and call the functor for each element. +@param[in] list base node (not a pointer to it) +@param[in] functor Functor that is called for each element in the list */ +template <typename List, class Functor> +inline void ut_list_map(const List& list, const Functor& functor) +{ + ulint count = 0; + + UT_LIST_IS_INITIALISED(list); + + for (typename List::elem_type* elem = list.start; elem; + elem = (elem->*list.node).next, ++count) { + + functor(elem); + } + + ut_a(count == list.count); +} + +/** Check the consistency of a doubly linked list. 
+@param[in] list base node (not a pointer to it) +@param[in,out] functor Functor that is called for each element in the list */ +template <typename List, class Functor> +void ut_list_validate(const List& list, Functor& functor) +{ + ut_list_map(list, functor); + + /* Validate the list backwards: count the nodes reachable from + list.end via the prev pointers and compare with list.count. */ + ulint count = 0; + + for (typename List::elem_type* elem = list.end; + elem != 0; + elem = (elem->*list.node).prev) { + ++count; + } + + ut_a(count == list.count); +} + +/** Check the consistency of a doubly linked list. +@param[in] list base node (not a pointer to it) +@param[in] functor Functor that is called for each element in the list */ +template <typename List, class Functor> +inline void ut_list_validate(const List& list, const Functor& functor) +{ + ut_list_map(list, functor); + + /* Validate the list backwards: count the nodes reachable from + list.end via the prev pointers and compare with list.count. */ + ulint count = 0; + + for (typename List::elem_type* elem = list.end; + elem != 0; + elem = (elem->*list.node).prev) { + ++count; + } + + ut_a(count == list.count); +} + +template <typename List> +inline void ut_list_validate(const List& list) +{ + ut_list_validate(list, NullValidate()); +} + +#ifdef UNIV_DEBUG +template <typename List> +inline void ut_list_reverse(List& list) +{ + UT_LIST_IS_INITIALISED(list); + + for (typename List::elem_type* elem = list.start; + elem != 0; + elem = (elem->*list.node).prev) { + (elem->*list.node).reverse(); + } + + list.reverse(); +} + +/** Check if the given element exists in the list. 
+@param[in] list the list object +@param[in] elem the element to look for, compared by pointer +@return true if elem was found while walking the list from its first node */ +template <typename List> +inline bool ut_list_exists(const List& list, typename List::elem_type* elem) +{ + for (typename List::elem_type* e1 = UT_LIST_GET_FIRST(list); e1; + e1 = (e1->*list.node).next) { + if (elem == e1) { + return true; + } + } + return false; +} +#endif + +/** Move the given element to the beginning of the list. 
+@param[in,out] list the list object +@param[in] elem the element of the list which will be moved + to the beginning of the list. */ +template <typename List> +void +ut_list_move_to_front( + List& list, + typename List::elem_type* elem) +{ + ut_ad(ut_list_exists(list, elem)); + + if (UT_LIST_GET_FIRST(list) != elem) { + ut_list_remove(list, elem); + ut_list_prepend(list, elem); + } +} + +#ifdef UNIV_DEBUG /* NOTE(review): this conditional block is empty */ +#endif + +#endif /* ut0lst_h */ diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h new file mode 100644 index 00000000..2fc864d4 --- /dev/null +++ b/storage/innobase/include/ut0mem.h @@ -0,0 +1,76 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0mem.h +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#ifndef ut0mem_h +#define ut0mem_h + +#include "univ.i" + +/******************************************************************** +Concatenate 3 strings.*/ +char* +ut_str3cat( +/*=======*/ + /* out, own: concatenated string, must be + freed with ut_free() */ + const char* s1, /* in: string 1 */ + const char* s2, /* in: string 2 */ + const char* s3); /* in: string 3 */ + +/**********************************************************************//** +Converts a raw binary data to a NUL-terminated hex string. The output is +truncated if there is not enough space in "hex", make sure "hex_size" is at +least (2 * raw_size + 1) if you do not want this to happen. Returns the +actual number of characters written to "hex" (including the NUL). +@return number of chars written */ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + const void* raw, /*!< in: raw data */ + ulint raw_size, /*!< in: "raw" length in bytes */ + char* hex, /*!< out: hex string */ + ulint hex_size); /*!< in: "hex" size in bytes */ + +/*******************************************************************//** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. Returns the number of bytes that were written to "buf" +(including the terminating NUL). If buf_size is too small then the +trailing bytes from "str" are discarded. 
+@return number of bytes that were written */ +UNIV_INLINE +ulint +ut_str_sql_format( +/*==============*/ + const char* str, /*!< in: string */ + ulint str_len, /*!< in: string length in bytes */ + char* buf, /*!< out: output buffer */ + ulint buf_size); /*!< in: output buffer size + in bytes */ + +#include "ut0mem.ic" + +#endif diff --git a/storage/innobase/include/ut0mem.ic b/storage/innobase/include/ut0mem.ic new file mode 100644 index 00000000..cc95a036 --- /dev/null +++ b/storage/innobase/include/ut0mem.ic @@ -0,0 +1,246 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0mem.ic +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#include "ut0byte.h" +#include "mach0data.h" + +/**********************************************************************//** +Converts a raw binary data to a NUL-terminated hex string. 
The output is
truncated if there is not enough space in "hex", make sure "hex_size" is at
least (2 * raw_size + 1) if you do not want this to happen. Returns the
actual number of characters written to "hex" (including the NUL).
If "hex_size" is 0 nothing is written and 0 is returned.
@return number of chars written */
UNIV_INLINE
ulint
ut_raw_to_hex(
/*==========*/
	const void*	raw,		/*!< in: raw data */
	ulint		raw_size,	/*!< in: "raw" length in bytes */
	char*		hex,		/*!< out: hex string */
	ulint		hex_size)	/*!< in: "hex" size in bytes */
{

/* MK_UINT16(a, b) packs two characters into one uint16 in such a way that,
when the uint16 is stored to memory on this platform's byte order,
character 'a' lands at the lower address and 'b' right after it.
UINT16_GET_A/B extract the two characters again.
NOTE(review): none of the helper macros defined in this function are
#undef'ed afterwards, so they stay visible to code that follows. */
#ifdef WORDS_BIGENDIAN

#define MK_UINT16(a, b)	(((uint16) (a)) << 8 | (uint16) (b))

#define UINT16_GET_A(u)	((char) ((u) >> 8))
#define UINT16_GET_B(u)	((char) ((u) & 0xFF))

#else /* WORDS_BIGENDIAN */

#define MK_UINT16(a, b)	(((uint16) (b)) << 8 | (uint16) (a))

#define UINT16_GET_A(u)	((char) ((u) & 0xFF))
#define UINT16_GET_B(u)	((char) ((u) >> 8))

#endif /* WORDS_BIGENDIAN */

/* Expands to the 16 table entries whose first hex digit is 'a'. */
#define MK_ALL_UINT16_WITH_A(a)	\
	MK_UINT16(a, '0'),	\
	MK_UINT16(a, '1'),	\
	MK_UINT16(a, '2'),	\
	MK_UINT16(a, '3'),	\
	MK_UINT16(a, '4'),	\
	MK_UINT16(a, '5'),	\
	MK_UINT16(a, '6'),	\
	MK_UINT16(a, '7'),	\
	MK_UINT16(a, '8'),	\
	MK_UINT16(a, '9'),	\
	MK_UINT16(a, 'A'),	\
	MK_UINT16(a, 'B'),	\
	MK_UINT16(a, 'C'),	\
	MK_UINT16(a, 'D'),	\
	MK_UINT16(a, 'E'),	\
	MK_UINT16(a, 'F')

	/* Lookup table indexed by byte value: entry N holds the two
	upper-case hex digits of N (16 rows of 16 entries each). */
	static const uint16	hex_map[256] = {
		MK_ALL_UINT16_WITH_A('0'),
		MK_ALL_UINT16_WITH_A('1'),
		MK_ALL_UINT16_WITH_A('2'),
		MK_ALL_UINT16_WITH_A('3'),
		MK_ALL_UINT16_WITH_A('4'),
		MK_ALL_UINT16_WITH_A('5'),
		MK_ALL_UINT16_WITH_A('6'),
		MK_ALL_UINT16_WITH_A('7'),
		MK_ALL_UINT16_WITH_A('8'),
		MK_ALL_UINT16_WITH_A('9'),
		MK_ALL_UINT16_WITH_A('A'),
		MK_ALL_UINT16_WITH_A('B'),
		MK_ALL_UINT16_WITH_A('C'),
		MK_ALL_UINT16_WITH_A('D'),
		MK_ALL_UINT16_WITH_A('E'),
		MK_ALL_UINT16_WITH_A('F')
	};
	const unsigned char*	rawc;
	ulint			read_bytes;
	ulint			write_bytes;
	ulint			i;

	rawc = (const unsigned char*) raw;

	if (hex_size == 0) {

		return(0);
	}

	if (hex_size <= 2 * raw_size) {

		/* Not enough room for all digits plus the NUL: the
		output is truncated and fills the whole buffer. */
		read_bytes = hex_size / 2;
		write_bytes = hex_size;
	} else {

		read_bytes = raw_size;
		write_bytes = 2 * raw_size + 1;
	}

/* Runs ASSIGN once per input byte, advancing "hex" by two characters
and "rawc" by one byte each time. */
#define LOOP_READ_BYTES(ASSIGN) \
	for (i = 0; i < read_bytes; i++) { \
		ASSIGN; \
		hex += 2; \
		rawc++; \
	}

	if (ut_align_offset(hex, 2) == 0) {

		/* "hex" is 2-byte aligned: store both digits with one
		uint16 write.
		NOTE(review): this writes a char buffer through a uint16
		lvalue (type punning) - confirm the build tolerates it. */
		LOOP_READ_BYTES(
			*(uint16*) hex = hex_map[*rawc]
		);
	} else {

		/* Unaligned destination: store the digits one by one. */
		LOOP_READ_BYTES(
			*hex = UINT16_GET_A(hex_map[*rawc]);
			*(hex + 1) = UINT16_GET_B(hex_map[*rawc])
		);
	}

	if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {

		/* Truncated output with an even buffer size: the loop
		filled the buffer completely, so step back and let the
		NUL overwrite the last digit. */
		hex--;
	}

	*hex = '\0';

	return(write_bytes);
}

/*******************************************************************//**
Adds single quotes to the start and end of string and escapes any quotes
by doubling them. Backslashes are doubled as well, and an embedded NUL
byte is written as the two characters backslash and '0'. Returns the
number of bytes that were written to "buf" (including the terminating
NUL). If buf_size is too small then the trailing bytes from "str" are
discarded.
@return number of bytes that were written */
UNIV_INLINE
ulint
ut_str_sql_format(
/*==============*/
	const char*	str,		/*!< in: string */
	ulint		str_len,	/*!< in: string length in bytes */
	char*		buf,		/*!< out: output buffer */
	ulint		buf_size)	/*!< in: output buffer size
					in bytes */
{
	ulint	str_i;
	ulint	buf_i;

	buf_i = 0;

	/* Buffers of fewer than 4 bytes are handled separately: with 3
	bytes only an empty string can be quoted, with 1 or 2 bytes only
	the NUL is written, and with 0 bytes nothing is written. */
	switch (buf_size) {
	case 3:

		if (str_len == 0) {

			buf[buf_i] = '\'';
			buf_i++;
			buf[buf_i] = '\'';
			buf_i++;
		}
		/* FALLTHROUGH */
	case 2:
	case 1:

		buf[buf_i] = '\0';
		buf_i++;
		/* FALLTHROUGH */
	case 0:

		return(buf_i);
	}

	/* buf_size >= 4 */

	buf[0] = '\'';
	buf_i = 1;

	for (str_i = 0; str_i < str_len; str_i++) {

		char	ch;

		/* Exactly two bytes left: they are needed for the
		closing quote and the NUL. */
		if (buf_size - buf_i == 2) {

			break;
		}

		ch = str[str_i];

		switch (ch) {
		case '\0':

			/* A two-character escape needs 4 free bytes:
			the escape, the closing quote and the NUL. */
			if (buf_size - buf_i < 4) {

				goto func_exit;
			}
			buf[buf_i] = '\\';
			buf_i++;
			buf[buf_i] = '0';
			buf_i++;
			break;
		case '\'':
		case '\\':

			if (buf_size - buf_i < 4) {

				goto func_exit;
			}
			/* Write the character once here and once more
			in the default branch, i.e. doubled. */
			buf[buf_i] = ch;
			buf_i++;
			/* FALLTHROUGH */
		default:

			buf[buf_i] = ch;
			buf_i++;
		}
	}

func_exit:

	buf[buf_i] = '\'';
	buf_i++;
	buf[buf_i] = '\0';
	buf_i++;

	return(buf_i);
}
diff --git a/storage/innobase/include/ut0mutex.h b/storage/innobase/include/ut0mutex.h
new file mode 100644
index 00000000..cb43583c
--- /dev/null
+++ b/storage/innobase/include/ut0mutex.h
@@ -0,0 +1,178 @@
/*****************************************************************************

Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/******************************************************************//**
@file include/ut0mutex.h
Policy based mutexes.

Created 2012-03-24 Sunny Bains.
***********************************************************************/

#pragma once
#ifndef UNIV_INNOCHECKSUM
#include "sync0policy.h"
#include "ib0mutex.h"

/** Create a typedef using the MutexType<PolicyType>
The expansion already ends in a semicolon.
@param[in]	M		Mutex type
@param[in]	P		Policy type
@param[in]	T		The resulting typedef alias */
#define UT_MUTEX_TYPE(M, P, T) typedef PolicyMutex<M<P> > T;

# ifdef __linux__
UT_MUTEX_TYPE(TTASFutexMutex, GenericPolicy, FutexMutex);
# endif /* __linux__ */

UT_MUTEX_TYPE(TTASMutex, GenericPolicy, SpinMutex);
UT_MUTEX_TYPE(OSTrackMutex, GenericPolicy, SysMutex);
UT_MUTEX_TYPE(TTASEventMutex, GenericPolicy, SyncArrayMutex);

/* Exactly one of MUTEX_FUTEX, MUTEX_SYS or MUTEX_EVENT must be defined;
it selects the type behind ib_mutex_t and the MUTEX_TYPE description. */
#ifdef MUTEX_FUTEX
/** The default mutex type. */
typedef FutexMutex ib_mutex_t;
#define MUTEX_TYPE	"Uses futexes"
#elif defined(MUTEX_SYS)
typedef SysMutex ib_mutex_t;
#define MUTEX_TYPE	"Uses system mutexes"
#elif defined(MUTEX_EVENT)
typedef SyncArrayMutex ib_mutex_t;
#define MUTEX_TYPE	"Uses event mutexes"
#else
#error "ib_mutex_t type is unknown"
#endif /* MUTEX_FUTEX */

extern uint	srv_spin_wait_delay;
extern ulong	srv_n_spin_wait_rounds;

/** Initialise mutex M with latch id I, recording the call site. */
#define mutex_create(I, M)		mutex_init((M), (I),		\
						   __FILE__, __LINE__)

/** Enter mutex M with the configured spin rounds and delay, reporting
the given file and line as the call site. */
#define mutex_enter_loc(M,file,line)	(M)->enter(			\
					uint32_t(srv_n_spin_wait_rounds), \
					uint32_t(srv_spin_wait_delay),	\
					file, line)
/** Enter mutex M, recording the current source location. */
#define mutex_enter(M)			mutex_enter_loc(M, __FILE__, __LINE__)

/** Enter mutex M passing 0 for both the spin rounds and the delay. */
#define mutex_enter_nospin(M)		(M)->enter(			\
					0,				\
					0,				\
					__FILE__, uint32_t(__LINE__))

/** Forward to M->trylock() with the current source location. */
#define mutex_enter_nowait(M)		(M)->trylock(__FILE__,		\
						     uint32_t(__LINE__))

#define mutex_exit(M)			(M)->exit()

#define mutex_free(M)			mutex_destroy(M)

#ifdef UNIV_DEBUG
/**
Checks that the mutex has been initialized. */
#define mutex_validate(M)		(M)->validate()

/**
Checks that the current thread owns the mutex. Works only
in the debug version.
*/ +#define mutex_own(M) (M)->is_owned() +#else +#define mutex_own(M) /* No op */ +#define mutex_validate(M) /* No op */ +#endif /* UNIV_DEBUG */ + +/** Iterate over the mutex meta data */ +class MutexMonitor { +public: + /** Constructor */ + MutexMonitor() { } + + /** Destructor */ + ~MutexMonitor() { } + + /** Enable the mutex monitoring */ + void enable(); + + /** Disable the mutex monitoring */ + void disable(); + + /** Reset the mutex monitoring values */ + void reset(); + + /** Invoke the callback for each active mutex collection + @param[in,out] callback Functor to call + @return false if callback returned false */ + template<typename Callback> + bool iterate(Callback& callback) const + UNIV_NOTHROW + { + LatchMetaData::iterator end = latch_meta.end(); + + for (LatchMetaData::iterator it = latch_meta.begin(); + it != end; + ++it) { + + /* Some of the slots will be null in non-debug mode */ + + if (latch_meta_t* l= *it) { + if (!callback(*l)) { + return false; + } + } + } + + return(true); + } +}; + +/** Defined in sync0sync.cc */ +extern MutexMonitor mutex_monitor; + +/** +Creates, or rather, initializes a mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. +Add the mutex instance to the global mutex list. +@param[in,out] mutex mutex to initialise +@param[in] id The mutex ID (Latch ID) +@param[in] filename Filename from where it was called +@param[in] line Line number in filename from where called */ +template <typename Mutex> +void mutex_init( + Mutex* mutex, + latch_id_t id, + const char* file_name, + uint32_t line) +{ + new(mutex) Mutex(); + + mutex->init(id, file_name, line); +} + +/** +Removes a mutex instance from the mutex list. The mutex is checked to +be in the reset state. 
@param[in,out]	 mutex		mutex instance to destroy */
template <typename Mutex>
void mutex_destroy(
	Mutex*		mutex)
{
	/* Only forwards to Mutex::destroy(); the storage holding the
	mutex object is not released here. */
	mutex->destroy();
}

#endif /* UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
new file mode 100644
index 00000000..e8469db9
--- /dev/null
+++ b/storage/innobase/include/ut0new.h
@@ -0,0 +1,1105 @@
/*****************************************************************************

Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file include/ut0new.h
Instrumented memory allocator.

Created May 26, 2014 Vasil Dimov
*******************************************************/

/** Dynamic memory allocation within InnoDB guidelines.
All dynamic (heap) memory allocations (malloc(3), strdup(3), etc, "new",
various std:: containers that allocate memory internally), that are done
within InnoDB are instrumented. This means that InnoDB uses a custom set
of functions for allocating memory, rather than calling e.g. "new" directly.

Here follows a cheat sheet on what InnoDB functions to use whenever a
standard one would have been used.
+ +Creating new objects with "new": +-------------------------------- +Standard: + new expression + or + new(std::nothrow) expression +InnoDB, default instrumentation: + UT_NEW_NOKEY(expression) +InnoDB, custom instrumentation, preferred: + UT_NEW(expression, key) + +Destroying objects, created with "new": +--------------------------------------- +Standard: + delete ptr +InnoDB: + UT_DELETE(ptr) + +Creating new arrays with "new[]": +--------------------------------- +Standard: + new type[num] + or + new(std::nothrow) type[num] +InnoDB, default instrumentation: + UT_NEW_ARRAY_NOKEY(type, num) +InnoDB, custom instrumentation, preferred: + UT_NEW_ARRAY(type, num, key) + +Destroying arrays, created with "new[]": +---------------------------------------- +Standard: + delete[] ptr +InnoDB: + UT_DELETE_ARRAY(ptr) + +Declaring a type with a std:: container, e.g. std::vector: +---------------------------------------------------------- +Standard: + std::vector<t> +InnoDB: + std::vector<t, ut_allocator<t> > + +Declaring objects of some std:: type: +------------------------------------- +Standard: + std::vector<t> v +InnoDB, default instrumentation: + std::vector<t, ut_allocator<t> > v +InnoDB, custom instrumentation, preferred: + std::vector<t, ut_allocator<t> > v(ut_allocator<t>(key)) + +Raw block allocation (as usual in C++, consider whether using "new" would +not be more appropriate): +------------------------------------------------------------------------- +Standard: + malloc(num) +InnoDB, default instrumentation: + ut_malloc_nokey(num) +InnoDB, custom instrumentation, preferred: + ut_malloc(num, key) + +Raw block resize: +----------------- +Standard: + realloc(ptr, new_size) +InnoDB: + ut_realloc(ptr, new_size) + +Raw block deallocation: +----------------------- +Standard: + free(ptr) +InnoDB: + ut_free(ptr) + +Note: the expression passed to UT_NEW() or UT_NEW_NOKEY() must always end +with (), thus: +Standard: + new int +InnoDB: + UT_NEW_NOKEY(int()) +*/ + +#ifndef 
ut0new_h +#define ut0new_h + +#include <algorithm> /* std::min() */ +#include <limits> /* std::numeric_limits */ +#include <map> /* std::map */ + +#include <stddef.h> +#include <stdlib.h> /* malloc() */ +#include <string.h> /* strlen(), strrchr(), strncmp() */ + +#include <my_sys.h> /* my_large_free/malloc() */ + +#include "my_global.h" /* needed for headers from mysql/psi/ */ + +#include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */ + +#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */ + +#include "os0thread.h" /* os_thread_sleep() */ +#include "ut0ut.h" /* ut_strcmp_functor, ut_basename_noext() */ + +#define OUT_OF_MEMORY_MSG \ + "Check if you should increase the swap file or ulimits of your" \ + " operating system. Note that on most 32-bit computers the process" \ + " memory space is limited to 2 GB or 4 GB." + +/** The total amount of memory currently allocated from the operating +system with allocate_large() */ +extern Atomic_counter<ulint> os_total_large_mem_allocated; + +/** Maximum number of retries to allocate memory. */ +extern const size_t alloc_max_retries; + +constexpr uint32_t INVALID_AUTOEVENT_IDX = 0xFFFFFFFFU; + +/** Keys for registering allocations with performance schema. +Pointers to these variables are supplied to PFS code via the pfs_info[] +array and the PFS code initializes them via PSI_MEMORY_CALL(register_memory)(). +mem_key_other and mem_key_std are special in the following way (see also +ut_allocator::get_mem_key()): +* If the caller has not provided a key and the file name of the caller is + unknown, then mem_key_std will be used. This happens only when called from + within std::* containers. +* If the caller has not provided a key and the file name of the caller is + known, but is not amongst the predefined names (see ut_new_boot()) then + mem_key_other will be used. Generally this should not happen and if it + happens then that means that the list of predefined names must be extended. 
Keep this list alphabetically sorted. */
extern PSI_memory_key	mem_key_ahi;
extern PSI_memory_key	mem_key_buf_buf_pool;
extern PSI_memory_key	mem_key_dict_stats_bg_recalc_pool_t;
extern PSI_memory_key	mem_key_dict_stats_index_map_t;
extern PSI_memory_key	mem_key_dict_stats_n_diff_on_level;
extern PSI_memory_key	mem_key_other;
extern PSI_memory_key	mem_key_row_log_buf;
extern PSI_memory_key	mem_key_row_merge_sort;
extern PSI_memory_key	mem_key_std;

/** Setup the internal objects needed for UT_NEW() to operate.
This must be called before the first call to UT_NEW(). */
void
ut_new_boot();

#ifdef UNIV_PFS_MEMORY

/**
Retrieve a memory key (registered with PFS),
given AUTOEVENT_IDX of the caller

@param[in] autoevent_idx - AUTOEVENT_IDX value of the caller
@return registered memory key or PSI_NOT_INSTRUMENTED */
PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx);

#endif /* UNIV_PFS_MEMORY */

/** A structure that holds the necessary data for performance schema
accounting. An object of this type is put in front of each allocated block
of memory when allocation is done by ut_allocator::allocate(). This is
because the data is needed even when freeing the memory. Users of
ut_allocator::allocate_large() are responsible for maintaining this
themselves.
Note that ut_allocator::allocate() only prepends this header when
UNIV_PFS_MEMORY is defined. */
struct ut_new_pfx_t {

#ifdef UNIV_PFS_MEMORY

	/** Performance schema key. Assigned to a name at startup via
	PSI_MEMORY_CALL(register_memory)() and later used for accounting
	allocations and deallocations with
	PSI_MEMORY_CALL(memory_alloc)(key, size, owner) and
	PSI_MEMORY_CALL(memory_free)(key, size, owner). */
	PSI_memory_key	m_key;

	/**
	Thread owner.
	Instrumented thread that owns the allocated memory.
	This state is used by the performance schema to maintain
	per thread statistics,
	when memory is given from thread A to thread B.
	*/
	struct PSI_thread	*m_owner;

#endif /* UNIV_PFS_MEMORY */

	/** Size of the allocated block in bytes, including this prepended
	aux structure (for ut_allocator::allocate()). For example if InnoDB
	code requests to allocate 100 bytes, and sizeof(ut_new_pfx_t) is 16,
	then 116 bytes are allocated in total and m_size will be 116.
	ut_allocator::allocate_large() does not prepend this struct to the
	allocated block and its users are responsible for maintaining it
	and passing it later to ut_allocator::deallocate_large(). */
	size_t		m_size;
#if SIZEOF_VOIDP == 4
	/** Pad the header size to a multiple of 64 bits on 32-bit systems,
	so that the payload will be aligned to 64 bits. */
	size_t		pad;
#endif
};

#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
/** Optionally ask the OS to exclude a memory range from core dumps.
A failing madvise() is only logged as a warning.
@param[in]	ptr		start of the range, must not be NULL
@param[in]	m_size		length of the range in bytes
@param[in]	dontdump	if false, nothing is done */
static inline void ut_dontdump(void *ptr, size_t m_size, bool dontdump)
{
	ut_a(ptr != NULL);

	if (dontdump && madvise(ptr, m_size, MADV_DONTDUMP)) {
		ib::warn() << "Failed to set memory to " DONTDUMP_STR ": "
			   << strerror(errno)
			   << " ptr " << ptr
			   << " size " << m_size;
	}
}

/** Ask the OS to include a memory range in core dumps again.
Unlike ut_dontdump(), a NULL ptr is tolerated and ignored. A failing
madvise() is only logged as a warning.
@param[in]	ptr	start of the range, or NULL
@param[in]	m_size	length of the range in bytes */
static inline void ut_dodump(void* ptr, size_t m_size)
{
	if (ptr && madvise(ptr, m_size, MADV_DODUMP)) {
		ib::warn() << "Failed to set memory to " DODUMP_STR ": "
			   << strerror(errno)
			   << " ptr " << ptr
			   << " size " << m_size;
	}
}
#else
/* Without DBUG_OFF, HAVE_MADVISE and MADV_DODUMP all defined, both
helpers are no-ops. */
static inline void ut_dontdump(void *, size_t, bool) {}
static inline void ut_dodump(void*, size_t) {}
#endif

/** Allocator class for allocating memory from inside std::* containers.
@tparam T type of allocated object
@tparam oom_fatal whether to commit suicide when running out of memory */
template <class T, bool oom_fatal = true>
class ut_allocator {
public:
	typedef T*		pointer;
	typedef const T*	const_pointer;
	typedef T&		reference;
	typedef const T&	const_reference;
	typedef T		value_type;
	typedef size_t		size_type;
	typedef ptrdiff_t	difference_type;

#ifdef UNIV_PFS_MEMORY
	/** Default constructor.
	*/
	explicit
	ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED)
		: m_key(key)
	{
	}
#else
	/* Without UNIV_PFS_MEMORY there is no m_key member; the key
	argument is accepted and ignored. */
	ut_allocator() {}
	ut_allocator(PSI_memory_key) {}
#endif /* UNIV_PFS_MEMORY */

	/** Constructor from allocator of another type.
	With UNIV_PFS_MEMORY the key reported by the other allocator is
	taken over, except that mem_key_std is stored as
	PSI_NOT_INSTRUMENTED. */
	template <class U>
	ut_allocator(const ut_allocator<U>&
#ifdef UNIV_PFS_MEMORY
		     other
#endif
		     )
	{
#ifdef UNIV_PFS_MEMORY
		const PSI_memory_key	other_key = other.get_mem_key();

		/* get_mem_key() without an argument reports mem_key_std
		for an allocator that has no key of its own; map that back
		to "no key" instead of pinning mem_key_std. */
		m_key = (other_key != mem_key_std)
			? other_key
			: PSI_NOT_INSTRUMENTED;
#endif /* UNIV_PFS_MEMORY */
	}

	/** Return the maximum number of objects that can be allocated by
	this allocator. With UNIV_PFS_MEMORY the space taken by the
	ut_new_pfx_t header is subtracted first. */
	size_type
	max_size() const
	{
		const size_type	s_max = std::numeric_limits<size_type>::max();

#ifdef UNIV_PFS_MEMORY
		return((s_max - sizeof(ut_new_pfx_t)) / sizeof(T));
#else
		return(s_max / sizeof(T));
#endif /* UNIV_PFS_MEMORY */
	}

	/** Single-argument allocate(): forwards with no hint pointer and
	INVALID_AUTOEVENT_IDX as the caller index. */
	pointer allocate(size_type n) { return allocate(n, NULL, INVALID_AUTOEVENT_IDX); }

	/** Allocate a chunk of memory that can hold 'n_elements' objects of
	type 'T' and trace the allocation.
	If the allocation fails this method may throw an exception. This
	is mandated by the standard and if it returns NULL instead, then
	STL containers that use it (e.g. std::vector) may get confused.
	After successful allocation the returned pointer must be passed
	to ut_allocator::deallocate() when no longer needed.
	@param[in]	n_elements	number of elements
	@param[in]	set_to_zero	if true, then the returned memory is
	initialized with 0x0 bytes.
+ @param[in] throw_on_error if true, raize exception if too big + @return pointer to the allocated memory */ + pointer + allocate( + size_type n_elements, + const_pointer, + uint32_t +#ifdef UNIV_PFS_MEMORY + autoevent_idx /* AUTOEVENT_IDX of the caller */ +#endif + , + bool set_to_zero = false, + bool throw_on_error = true) + { + if (n_elements == 0) { + return(NULL); + } + + if (n_elements > max_size()) { + if (throw_on_error) { + throw(std::bad_alloc()); + } else { + return(NULL); + } + } + + void* ptr; + size_t total_bytes = n_elements * sizeof(T); + +#ifdef UNIV_PFS_MEMORY + /* The header size must not ruin the 64-bit alignment + on 32-bit systems. Some allocated structures use + 64-bit fields. */ + ut_ad((sizeof(ut_new_pfx_t) & 7) == 0); + total_bytes += sizeof(ut_new_pfx_t); +#endif /* UNIV_PFS_MEMORY */ + + for (size_t retries = 1; ; retries++) { + + if (set_to_zero) { + ptr = calloc(1, total_bytes); + } else { + ptr = malloc(total_bytes); + } + + if (ptr != NULL || retries >= alloc_max_retries) { + break; + } + + os_thread_sleep(1000000 /* 1 second */); + } + + if (ptr == NULL) { + ib::fatal_or_error(oom_fatal) + << "Cannot allocate " << total_bytes + << " bytes of memory after " + << alloc_max_retries << " retries over " + << alloc_max_retries << " seconds. OS error: " + << strerror(errno) << " (" << errno << "). " + << OUT_OF_MEMORY_MSG; + if (throw_on_error) { + throw(std::bad_alloc()); + } else { + return(NULL); + } + } + +#ifdef UNIV_PFS_MEMORY + ut_new_pfx_t* pfx = static_cast<ut_new_pfx_t*>(ptr); + + allocate_trace(total_bytes, autoevent_idx, pfx); + + return(reinterpret_cast<pointer>(pfx + 1)); +#else + return(reinterpret_cast<pointer>(ptr)); +#endif /* UNIV_PFS_MEMORY */ + } + + /** Free a memory allocated by allocate() and trace the deallocation. 
	@param[in,out]	ptr		pointer to memory to free
	@param[in]	n_elements	not used by this implementation */
	void deallocate(pointer ptr, size_type n_elements = 0)
	{
#ifdef UNIV_PFS_MEMORY
		if (ptr == NULL) {
			return;
		}

		/* The accounting header sits immediately before the
		address that was handed out by allocate(). */
		ut_new_pfx_t*	pfx = reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;

		deallocate_trace(pfx);

		free(pfx);
#else
		free(ptr);
#endif /* UNIV_PFS_MEMORY */
	}

	/** Create an object of type 'T' using the value 'val' over the
	memory pointed by 'p' (copy construction with placement new). */
	void
	construct(
		pointer		p,
		const T&	val)
	{
		new(p) T(val);
	}

	/** Destroy an object pointed by 'p'. Only the destructor is run;
	the memory is not released. */
	void
	destroy(
		pointer	p)
	{
		p->~T();
	}

	/** Return the address of an object. */
	pointer
	address(
		reference	x) const
	{
		return(&x);
	}

	/** Return the address of a const object. */
	const_pointer
	address(
		const_reference	x) const
	{
		return(&x);
	}

	/** Allocator of the same family for another element type.
	NOTE(review): 'other' is ut_allocator<U> with the default
	oom_fatal, so rebinding an allocator declared with
	oom_fatal = false does not carry that setting over - confirm this
	is intended. */
	template <class U>
	struct rebind {
		typedef ut_allocator<U>	other;
	};

	/* The following are custom methods, not required by the standard. */

#ifdef UNIV_PFS_MEMORY

	/** realloc(3)-like method.
	The passed in ptr must have been returned by allocate() and the
	pointer returned by this method must be passed to deallocate() when
	no longer needed.
	If n_elements is 0 the block is freed and NULL is returned; if ptr
	is NULL this behaves like a non-throwing allocate(). When NULL is
	returned because the request is too big or realloc() failed, the
	old block has not been freed by this method.
	@param[in,out]	ptr		old pointer to reallocate
	@param[in]	n_elements	new number of elements to allocate
	@param[in]	autoevent_idx	AUTOEVENT_IDX of the caller
	@return newly allocated memory, or NULL */
	pointer
	reallocate(
		void*		ptr,
		size_type	n_elements,
		uint32_t	autoevent_idx)
	{
		if (n_elements == 0) {
			deallocate(static_cast<pointer>(ptr));
			return(NULL);
		}

		if (ptr == NULL) {
			return(allocate(n_elements, NULL, autoevent_idx, false, false));
		}

		if (n_elements > max_size()) {
			return(NULL);
		}

		ut_new_pfx_t*	pfx_old;
		ut_new_pfx_t*	pfx_new;
		size_t		total_bytes;

		/* The real start of the block is the accounting header
		placed in front of the user pointer. */
		pfx_old = reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;

		total_bytes = n_elements * sizeof(T) + sizeof(ut_new_pfx_t);

		/* Try up to alloc_max_retries times, sleeping one second
		between failed attempts. */
		for (size_t retries = 1; ; retries++) {

			pfx_new = static_cast<ut_new_pfx_t*>(
				realloc(pfx_old, total_bytes));

			if (pfx_new != NULL || retries >= alloc_max_retries) {
				break;
			}

			os_thread_sleep(1000000 /* 1 second */);
		}

		if (pfx_new == NULL) {
			ib::fatal_or_error(oom_fatal)
				<< "Cannot reallocate " << total_bytes
				<< " bytes of memory after "
				<< alloc_max_retries << " retries over "
				<< alloc_max_retries << " seconds. OS error: "
				<< strerror(errno) << " (" << errno << "). "
				<< OUT_OF_MEMORY_MSG;
			return(NULL);
		}

		/* pfx_new still contains the description of the old block
		that was presumably freed by realloc(). */
		deallocate_trace(pfx_new);

		/* pfx_new is set here to describe the new block. */
		allocate_trace(total_bytes, autoevent_idx, pfx_new);

		return(reinterpret_cast<pointer>(pfx_new + 1));
	}

	/** Allocate, trace the allocation and construct 'n_elements' objects
	of type 'T'. If the allocation fails, this method returns NULL.
	If one of the constructors throws an exception, the objects
	constructed so far are destroyed, the memory is released and the
	exception is rethrown to the caller. After successful completion
	the returned pointer must be passed to delete_array() when no
	longer needed.
+ @param[in] n_elements number of elements to allocate + @param[in] file file name of the caller + @return pointer to the first allocated object or NULL */ + pointer + new_array( + size_type n_elements, + uint32_t autoevent_idx + ) + { + T* p = allocate(n_elements, NULL, autoevent_idx, false, false); + + if (p == NULL) { + return(NULL); + } + + T* first = p; + size_type i; + + try { + for (i = 0; i < n_elements; i++) { + new(p) T; + ++p; + } + } catch (...) { + for (size_type j = 0; j < i; j++) { + --p; + p->~T(); + } + + deallocate(first); + + throw; + } + + return(first); + } + + /** Destroy, deallocate and trace the deallocation of an array created + by new_array(). + @param[in,out] ptr pointer to the first object in the array */ + void + delete_array( + T* ptr) + { + if (ptr == NULL) { + return; + } + + const size_type n_elements = n_elements_allocated(ptr); + + T* p = ptr + n_elements - 1; + + for (size_type i = 0; i < n_elements; i++) { + p->~T(); + --p; + } + + deallocate(ptr); + } + +#endif /* UNIV_PFS_MEMORY */ + + /** Allocate a large chunk of memory that can hold 'n_elements' + objects of type 'T' and trace the allocation. + @param[in] n_elements number of elements + @param[in] dontdump if true, advise the OS is not to core + dump this memory. + @param[out] pfx storage for the description of the + allocated memory. The caller must provide space for this one and keep + it until the memory is no longer needed and then pass it to + deallocate_large(). 
+ @return pointer to the allocated memory or NULL */ + pointer + allocate_large( + size_type n_elements, + ut_new_pfx_t* pfx, + bool dontdump = false) + { + if (n_elements == 0 || n_elements > max_size()) { + return(NULL); + } + + ulint n_bytes = n_elements * sizeof(T); + + pointer ptr = reinterpret_cast<pointer>( + my_large_malloc(&n_bytes, MYF(0))); + + if (ptr == NULL) { + return NULL; + } + + ut_dontdump(ptr, n_bytes, dontdump); + + if (pfx != NULL) { +#ifdef UNIV_PFS_MEMORY + allocate_trace(n_bytes, 0, pfx); +#endif /* UNIV_PFS_MEMORY */ + pfx->m_size = n_bytes; + } + + os_total_large_mem_allocated += n_bytes; + + return(ptr); + } + + pointer + allocate_large_dontdump( + size_type n_elements, + ut_new_pfx_t* pfx) + { + return allocate_large(n_elements, pfx, true); + } + /** Free a memory allocated by allocate_large() and trace the + deallocation. + @param[in,out] ptr pointer to memory to free + @param[in] pfx descriptor of the memory, as returned by + allocate_large(). */ + void + deallocate_large( + pointer ptr, + const ut_new_pfx_t* pfx) + { + size_t size = pfx->m_size; +#ifdef UNIV_PFS_MEMORY + if (pfx) { + deallocate_trace(pfx); + } +#endif /* UNIV_PFS_MEMORY */ + os_total_large_mem_allocated -= size; + + my_large_free(ptr, size); + } + + void + deallocate_large_dodump( + pointer ptr, + const ut_new_pfx_t* pfx) + { + ut_dodump(ptr, pfx->m_size); + deallocate_large(ptr, pfx); + } + +#ifdef UNIV_PFS_MEMORY + /** Get the performance schema key to use for tracing allocations. 
+ @param[in] file file name of the caller or NULL if unknown + @return performance schema key */ + PSI_memory_key + get_mem_key( + uint32_t autoevent_idx = INVALID_AUTOEVENT_IDX) const + { + if (m_key != PSI_NOT_INSTRUMENTED) { + return(m_key); + } + + if (autoevent_idx == INVALID_AUTOEVENT_IDX) { + return(mem_key_std); + } + const PSI_memory_key key = ut_new_get_key_by_file(autoevent_idx); + + if (key != PSI_NOT_INSTRUMENTED) { + return(key); + } + + return(mem_key_other); + } + +private: + + /** Retrieve the size of a memory block allocated by new_array(). + @param[in] ptr pointer returned by new_array(). + @return size of memory block */ + size_type + n_elements_allocated( + const_pointer ptr) + { + const ut_new_pfx_t* pfx + = reinterpret_cast<const ut_new_pfx_t*>(ptr) - 1; + + const size_type user_bytes + = pfx->m_size - sizeof(ut_new_pfx_t); + + ut_ad(user_bytes % sizeof(T) == 0); + + return(user_bytes / sizeof(T)); + } + + /** Trace a memory allocation. + After the accounting, the data needed for tracing the deallocation + later is written into 'pfx'. + The PFS event name is picked on the following criteria: + 1. If key (!= PSI_NOT_INSTRUMENTED) has been specified when constructing + this ut_allocator object, then the name associated with that key will + be used (this is the recommended approach for new code) + 2. Otherwise, if "file" is NULL, then the name associated with + mem_key_std will be used + 3. Otherwise, if an entry is found by ut_new_get_key_by_file(), that + corresponds to "file", that will be used (see ut_new_boot()) + 4. Otherwise, the name associated with mem_key_other will be used. 
+ @param[in] size number of bytes that were allocated + @param[in] autoevent_idx autoevent_idx of the caller + @param[out] pfx placeholder to store the info which will be + needed when freeing the memory */ + void + allocate_trace( + size_t size, + const uint32_t autoevent_idx, + ut_new_pfx_t* pfx) + { + const PSI_memory_key key = get_mem_key(autoevent_idx); + + pfx->m_key = PSI_MEMORY_CALL(memory_alloc)(key, size, & pfx->m_owner); + pfx->m_size = size; + } + + /** Trace a memory deallocation. + @param[in] pfx info for the deallocation */ + void + deallocate_trace( + const ut_new_pfx_t* pfx) + { + PSI_MEMORY_CALL(memory_free)(pfx->m_key, pfx->m_size, pfx->m_owner); + } + + /** Performance schema key. */ + PSI_memory_key m_key; + +#endif /* UNIV_PFS_MEMORY */ + +private: + + /** Assignment operator, not used, thus disabled (private). */ + template <class U> + void + operator=( + const ut_allocator<U>&); +}; + +/** Compare two allocators of the same type. +As long as the type of A1 and A2 is the same, a memory allocated by A1 +could be freed by A2 even if the pfs mem key is different. */ +template <typename T> +inline +bool +operator==(const ut_allocator<T>&, const ut_allocator<T>&) { return(true); } + +/** Compare two allocators of the same type. */ +template <typename T> +inline +bool +operator!=( + const ut_allocator<T>& lhs, + const ut_allocator<T>& rhs) +{ + return(!(lhs == rhs)); +} + +#ifdef UNIV_PFS_MEMORY + +/* + constexpr trickery ahead. + + Compute AUTOEVENT_IDX at compile time. + (index in the auto_event_names array, corresponding to basename of __FILE__) + + The tricks are necessary to reduce the cost of lookup the + PSI_memory_key for auto event. +*/ + +static constexpr const char* cexpr_basename_helper(const char* s, const char* last_slash) +{ + return + *s == '\0' ? last_slash : + *s == '/' || *s == '\\' ? 
cexpr_basename_helper(s + 1, s + 1) : + cexpr_basename_helper(s + 1, last_slash); +} + +static constexpr const char* cexpr_basename(const char* filename) +{ + return cexpr_basename_helper(filename, filename); +} + +static constexpr bool cexpr_strequal_ignore_dot(const char* a, const char* b) +{ + return *a == 0 || *a == '.' ? (*b == 0 || *b == '.') + : *a == *b ? cexpr_strequal_ignore_dot(a + 1, b + 1) : false; +} + +constexpr const char* const auto_event_names[] = +{ + "btr0btr", + "btr0buf", + "btr0bulk", + "btr0cur", + "btr0pcur", + "btr0sea", + "buf0buf", + "buf0dblwr", + "buf0dump", + "dict0dict", + "dict0mem", + "dict0stats", + "eval0eval", + "fil0crypt", + "fil0fil", + "fsp0file", + "fts0ast", + "fts0blex", + "fts0config", + "fts0file", + "fts0fts", + "fts0opt", + "fts0pars", + "fts0que", + "fts0sql", + "fts0tlex", + "gis0sea", + "ha_innodb", + "handler0alter", + "hash0hash", + "i_s", + "lexyy", + "lock0lock", + "mem0mem", + "os0event", + "os0file", + "pars0lex", + "rem0rec", + "row0ftsort", + "row0import", + "row0log", + "row0merge", + "row0mysql", + "row0sel", + "srv0start", + "sync0arr", + "sync0debug", + "sync0rw", + "sync0start", + "sync0types", + "trx0i_s", + "trx0i_s", + "trx0roll", + "trx0rseg", + "trx0seg", + "trx0trx", + "trx0undo", + "ut0list", + "ut0mem", + "ut0new", + "ut0pool", + "ut0rbt", + "ut0wqueue", + "xtrabackup", + nullptr +}; + +constexpr uint32_t cexpr_lookup_auto_event_name(const char* name, uint32_t idx = 0) +{ + return !auto_event_names[idx] ? INVALID_AUTOEVENT_IDX : + cexpr_strequal_ignore_dot(name, auto_event_names[idx]) ? idx : + cexpr_lookup_auto_event_name(name, idx + 1); +} + +/* + The AUTOEVENT_IDX macro. + + Note, that there is a static_assert that checks whether + basename of the __FILE is not registered in the auto_event_names array. + If you run into this assert, add the basename to the array. + + Weird looking lambda is used to force the evaluation at the compile time. 
+*/ +#define AUTOEVENT_IDX []()\ +{\ + constexpr auto idx = cexpr_lookup_auto_event_name(cexpr_basename(__FILE__)); \ + static_assert(idx != INVALID_AUTOEVENT_IDX, "auto_event_names contains no entry for " __FILE__);\ + return idx; \ +}() + + +/** Allocate, trace the allocation and construct an object. +Use this macro instead of 'new' within InnoDB. +For example: instead of + Foo* f = new Foo(args); +use: + Foo* f = UT_NEW(Foo(args), mem_key_some); +Upon failure to allocate the memory, this macro may return NULL. It +will not throw exceptions. After successfull allocation the returned +pointer must be passed to UT_DELETE() when no longer needed. +@param[in] expr any expression that could follow "new" +@param[in] key performance schema memory tracing key +@return pointer to the created object or NULL */ +#define UT_NEW(expr, key) \ + /* Placement new will return NULL and not attempt to construct an + object if the passed in pointer is NULL, e.g. if allocate() has + failed to allocate memory and has returned NULL. */ \ + ::new(ut_allocator<byte>(key).allocate( \ + sizeof expr, NULL, AUTOEVENT_IDX, false, false)) expr + +/** Allocate, trace the allocation and construct an object. +Use this macro instead of 'new' within InnoDB and instead of UT_NEW() +when creating a dedicated memory key is not feasible. +For example: instead of + Foo* f = new Foo(args); +use: + Foo* f = UT_NEW_NOKEY(Foo(args)); +Upon failure to allocate the memory, this macro may return NULL. It +will not throw exceptions. After successfull allocation the returned +pointer must be passed to UT_DELETE() when no longer needed. +@param[in] expr any expression that could follow "new" +@return pointer to the created object or NULL */ +#define UT_NEW_NOKEY(expr) UT_NEW(expr, PSI_NOT_INSTRUMENTED) + +/** Destroy, deallocate and trace the deallocation of an object created by +UT_NEW() or UT_NEW_NOKEY(). 
+We can't instantiate ut_allocator without having the type of the object, thus +we redirect this to a templated function. */ +#define UT_DELETE(ptr) ut_delete(ptr) + + +/** Destroy and account object created by UT_NEW() or UT_NEW_NOKEY(). +@param[in,out] ptr pointer to the object */ +template <typename T> +inline +void +ut_delete( + T* ptr) +{ + if (ptr == NULL) { + return; + } + + ut_allocator<T> allocator; + + allocator.destroy(ptr); + allocator.deallocate(ptr); +} + +/** Allocate and account 'n_elements' objects of type 'type'. +Use this macro to allocate memory within InnoDB instead of 'new[]'. +The returned pointer must be passed to UT_DELETE_ARRAY(). +@param[in] type type of objects being created +@param[in] n_elements number of objects to create +@param[in] key performance schema memory tracing key +@return pointer to the first allocated object or NULL */ +#define UT_NEW_ARRAY(type, n_elements, key) \ + ut_allocator<type>(key).new_array(n_elements, AUTOEVENT_IDX) + +/** Allocate and account 'n_elements' objects of type 'type'. +Use this macro to allocate memory within InnoDB instead of 'new[]' and +instead of UT_NEW_ARRAY() when it is not feasible to create a dedicated key. +@param[in] type type of objects being created +@param[in] n_elements number of objects to create +@return pointer to the first allocated object or NULL */ +#define UT_NEW_ARRAY_NOKEY(type, n_elements) \ + UT_NEW_ARRAY(type, n_elements, PSI_NOT_INSTRUMENTED) + +/** Destroy, deallocate and trace the deallocation of an array created by +UT_NEW_ARRAY() or UT_NEW_ARRAY_NOKEY(). +We can't instantiate ut_allocator without having the type of the object, thus +we redirect this to a templated function. */ +#define UT_DELETE_ARRAY(ptr) ut_delete_array(ptr) + +/** Destroy and account objects created by UT_NEW_ARRAY() or +UT_NEW_ARRAY_NOKEY(). 
+@param[in,out] ptr pointer to the first object in the array */ +template <typename T> +inline +void +ut_delete_array( + T* ptr) +{ + ut_allocator<T>().delete_array(ptr); +} + +#define ut_malloc(n_bytes, key) static_cast<void*>( \ + ut_allocator<byte>(key).allocate( \ + n_bytes, NULL, AUTOEVENT_IDX, false, false)) + +#define ut_malloc_dontdump(n_bytes, key) static_cast<void*>( \ + ut_allocator<byte>(key).allocate_large( \ + n_bytes, NULL, true)) + +#define ut_zalloc(n_bytes, key) static_cast<void*>( \ + ut_allocator<byte>(key).allocate( \ + n_bytes, NULL, AUTOEVENT_IDX, true, false)) + +#define ut_malloc_nokey(n_bytes) static_cast<void*>( \ + ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \ + n_bytes, NULL, AUTOEVENT_IDX, false, false)) + +#define ut_zalloc_nokey(n_bytes) static_cast<void*>( \ + ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \ + n_bytes, NULL, AUTOEVENT_IDX, true, false)) + +#define ut_zalloc_nokey_nofatal(n_bytes) static_cast<void*>( \ + ut_allocator<byte, false>(PSI_NOT_INSTRUMENTED).allocate( \ + n_bytes, NULL, AUTOEVENT_IDX, true, false)) + +#define ut_realloc(ptr, n_bytes) static_cast<void*>( \ + ut_allocator<byte>(PSI_NOT_INSTRUMENTED).reallocate( \ + ptr, n_bytes, AUTOEVENT_IDX)) + +#define ut_free(ptr) ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \ + reinterpret_cast<byte*>(ptr)) + +#else /* UNIV_PFS_MEMORY */ + +/* Fallbacks when memory tracing is disabled at compile time. 
*/ + +#define UT_NEW(expr, key) ::new(std::nothrow) expr +#define UT_NEW_NOKEY(expr) ::new(std::nothrow) expr +#define UT_DELETE(ptr) ::delete ptr + +#define UT_NEW_ARRAY(type, n_elements, key) \ + ::new(std::nothrow) type[n_elements] + +#define UT_NEW_ARRAY_NOKEY(type, n_elements) \ + ::new(std::nothrow) type[n_elements] + +#define UT_DELETE_ARRAY(ptr) ::delete[] ptr + +#define ut_malloc(n_bytes, key) ::malloc(n_bytes) + +#define ut_zalloc(n_bytes, key) ::calloc(1, n_bytes) + +#define ut_malloc_nokey(n_bytes) ::malloc(n_bytes) + +static inline void *ut_malloc_dontdump(size_t n_bytes, ...) +{ + void *ptr = my_large_malloc(&n_bytes, MYF(0)); + + ut_dontdump(ptr, n_bytes, true); + + if (ptr) { + os_total_large_mem_allocated += n_bytes; + } + return ptr; +} + +#define ut_zalloc_nokey(n_bytes) ::calloc(1, n_bytes) + +#define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes) + +#define ut_realloc(ptr, n_bytes) ::realloc(ptr, n_bytes) + +#define ut_free(ptr) ::free(ptr) + +#endif /* UNIV_PFS_MEMORY */ + +static inline void ut_free_dodump(void *ptr, size_t size) +{ + ut_dodump(ptr, size); + os_total_large_mem_allocated -= size; + my_large_free(ptr, size); +} + +#endif /* ut0new_h */ diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h new file mode 100644 index 00000000..e0a1f7c0 --- /dev/null +++ b/storage/innobase/include/ut0pool.h @@ -0,0 +1,363 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0pool.h +Object pool. + +Created 2012-Feb-26 Sunny Bains +***********************************************************************/ + +#ifndef ut0pool_h +#define ut0pool_h + +#include <vector> +#include <queue> +#include <functional> + +#include "ut0new.h" + +/** Allocate the memory for the object in blocks. We keep the objects sorted +on pointer so that they are closer together in case they have to be iterated +over in a list. */ +template <typename Type, typename Factory, typename LockStrategy> +struct Pool { + + typedef Type value_type; + + // FIXME: Add an assertion to check alignment and offset is + // as we expect it. Also, sizeof(void*) can be 8, can we impove on this. + struct Element { + Pool* m_pool; + value_type m_type; + }; + + /** Constructor + @param size size of the memory block */ + Pool(size_t size) + : + m_end(), + m_start(), + m_size(size), + m_last() + { + ut_a(size >= sizeof(Element)); + + m_lock_strategy.create(); + + ut_a(m_start == 0); + + m_start = reinterpret_cast<Element*>(ut_zalloc_nokey(m_size)); + + m_last = m_start; + + m_end = &m_start[m_size / sizeof(*m_start)]; + + /* Note: Initialise only a small subset, even though we have + allocated all the memory. This is required only because PFS + (MTR) results change if we instantiate too many mutexes up + front. 
*/ + + init(ut_min(size_t(16), size_t(m_end - m_start))); + + ut_ad(m_pqueue.size() <= size_t(m_last - m_start)); + } + + /** Destructor */ + ~Pool() + { + m_lock_strategy.destroy(); + + for (Element* elem = m_start; elem != m_last; ++elem) { + + ut_ad(elem->m_pool == this); + Factory::destroy(&elem->m_type); + } + + ut_free(m_start); + m_end = m_last = m_start = 0; + m_size = 0; + } + + /** Get an object from the pool. + @retrun a free instance or NULL if exhausted. */ + Type* get() + { + Element* elem; + + m_lock_strategy.enter(); + + if (!m_pqueue.empty()) { + + elem = m_pqueue.top(); + m_pqueue.pop(); + + } else if (m_last < m_end) { + + /* Initialise the remaining elements. */ + init(size_t(m_end - m_last)); + + ut_ad(!m_pqueue.empty()); + + elem = m_pqueue.top(); + m_pqueue.pop(); + } else { + elem = NULL; + } + + m_lock_strategy.exit(); + return elem ? &elem->m_type : NULL; + } + + /** Add the object to the pool. + @param ptr object to free */ + static void mem_free(value_type* ptr) + { + Element* elem; + byte* p = reinterpret_cast<byte*>(ptr + 1); + + elem = reinterpret_cast<Element*>(p - sizeof(*elem)); + + elem->m_pool->m_lock_strategy.enter(); + + elem->m_pool->putl(elem); + + elem->m_pool->m_lock_strategy.exit(); + } + +protected: + // Disable copying + Pool(const Pool&); + Pool& operator=(const Pool&); + +private: + + /* We only need to compare on pointer address. */ + typedef std::priority_queue< + Element*, + std::vector<Element*, ut_allocator<Element*> >, + std::greater<Element*> > pqueue_t; + + /** Release the object to the free pool + @param elem element to free */ + void putl(Element* elem) + { + ut_ad(elem >= m_start && elem < m_last); + m_pqueue.push(elem); + } + + /** Initialise the elements. 
+ @param n_elems Number of elements to initialise */ + void init(size_t n_elems) + { + ut_ad(size_t(m_end - m_last) >= n_elems); + + for (size_t i = 0; i < n_elems; ++i, ++m_last) { + + m_last->m_pool = this; + Factory::init(&m_last->m_type); + m_pqueue.push(m_last); + } + + ut_ad(m_last <= m_end); + } + +private: + /** Pointer to the last element */ + Element* m_end; + + /** Pointer to the first element */ + Element* m_start; + + /** Size of the block in bytes */ + size_t m_size; + + /** Upper limit of used space */ + Element* m_last; + + /** Priority queue ordered on the pointer addresse. */ + pqueue_t m_pqueue; + + /** Lock strategy to use */ + LockStrategy m_lock_strategy; +}; + +template <typename Pool, typename LockStrategy> +struct PoolManager { + + typedef Pool PoolType; + typedef typename PoolType::value_type value_type; + + PoolManager(size_t size) + : + m_size(size) + { + create(); + } + + ~PoolManager() + { + destroy(); + + ut_a(m_pools.empty()); + } + + /** Get an element from one of the pools. + @return instance or NULL if pool is empty. */ + value_type* get() + { + size_t index = 0; + size_t delay = 1; + value_type* ptr = NULL; + + do { + m_lock_strategy.enter(); + + ut_ad(!m_pools.empty()); + + size_t n_pools = m_pools.size(); + + PoolType* pool = m_pools[index % n_pools]; + + m_lock_strategy.exit(); + + ptr = pool->get(); + + if (ptr == 0 && (index / n_pools) > 2) { + + if (!add_pool(n_pools)) { + + ib::error() << "Failed to allocate" + " memory for a pool of size " + << m_size << " bytes. Will" + " wait for " << delay + << " seconds for a thread to" + " free a resource"; + + /* There is nothing much we can do + except crash and burn, however lets + be a little optimistic and wait for + a resource to be freed. 
*/ + os_thread_sleep(delay * 1000000); + + if (delay < 32) { + delay <<= 1; + } + + } else { + delay = 1; + } + } + + ++index; + + } while (ptr == NULL); + + return(ptr); + } + + static void mem_free(value_type* ptr) + { + PoolType::mem_free(ptr); + } + +private: + /** Add a new pool + @param n_pools Number of pools that existed when the add pool was + called. + @return true on success */ + bool add_pool(size_t n_pools) + { + bool added = false; + + m_lock_strategy.enter(); + + if (n_pools < m_pools.size()) { + /* Some other thread already added a pool. */ + added = true; + } else { + PoolType* pool; + + ut_ad(n_pools == m_pools.size()); + + pool = UT_NEW_NOKEY(PoolType(m_size)); + + if (pool != NULL) { + + ut_ad(n_pools <= m_pools.size()); + + m_pools.push_back(pool); + + ib::info() << "Number of pools: " + << m_pools.size(); + + added = true; + } + } + + ut_ad(n_pools < m_pools.size() || !added); + + m_lock_strategy.exit(); + + return(added); + } + + /** Create the pool manager. */ + void create() + { + ut_a(m_size > sizeof(value_type)); + m_lock_strategy.create(); + + add_pool(0); + } + + /** Release the resources. 
*/ + void destroy() + { + typename Pools::iterator it; + typename Pools::iterator end = m_pools.end(); + + for (it = m_pools.begin(); it != end; ++it) { + PoolType* pool = *it; + + UT_DELETE(pool); + } + + m_pools.clear(); + + m_lock_strategy.destroy(); + } +private: + // Disable copying + PoolManager(const PoolManager&); + PoolManager& operator=(const PoolManager&); + + typedef std::vector<PoolType*, ut_allocator<PoolType*> > Pools; + + /** Size of each block */ + size_t m_size; + + /** Pools managed this manager */ + Pools m_pools; + + /** Lock strategy to use */ + LockStrategy m_lock_strategy; +}; + +#endif /* ut0pool_h */ diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h new file mode 100644 index 00000000..38071165 --- /dev/null +++ b/storage/innobase/include/ut0rbt.h @@ -0,0 +1,254 @@ +/***************************************************************************** + +Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/******************************************************************//** +@file include/ut0rbt.h +Various utilities + +Created 2007-03-20 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_UT0RBT_H +#define INNOBASE_UT0RBT_H + +#if !defined(IB_RBT_TESTING) +#include "ut0mem.h" +#else +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#define ut_malloc malloc +#define ut_free free +#define ulint unsigned long +#define ut_a(c) assert(c) +#define ut_error assert(0) +#define ibool unsigned int +#define TRUE 1 +#define FALSE 0 +#endif + +struct ib_rbt_node_t; +typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node); +typedef int (*ib_rbt_compare)(const void* p1, const void* p2); +typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2); + +/** Red black tree color types */ +enum ib_rbt_color_t { + IB_RBT_RED, + IB_RBT_BLACK +}; + +/** Red black tree node */ +struct ib_rbt_node_t { + ib_rbt_color_t color; /* color of this node */ + + ib_rbt_node_t* left; /* points left child */ + ib_rbt_node_t* right; /* points right child */ + ib_rbt_node_t* parent; /* points parent node */ + + char value[1]; /* Data value */ +}; + +/** Red black tree instance.*/ +struct ib_rbt_t { + ib_rbt_node_t* nil; /* Black colored node that is + used as a sentinel. This is + pre-allocated too.*/ + + ib_rbt_node_t* root; /* Root of the tree, this is + pre-allocated and the first + data node is the left child.*/ + + ulint n_nodes; /* Total number of data nodes */ + + ib_rbt_compare compare; /* Fn. to use for comparison */ + ib_rbt_arg_compare + compare_with_arg; /* Fn. 
to use for comparison + with argument */ + ulint sizeof_value; /* Sizeof the item in bytes */ + void* cmp_arg; /* Compare func argument */ +}; + +/** The result of searching for a key in the tree, this is useful for +a speedy lookup and insert if key doesn't exist.*/ +struct ib_rbt_bound_t { + const ib_rbt_node_t* + last; /* Last node visited */ + + int result; /* Result of comparing with + the last non-nil node that + was visited */ +}; + +/* Size in elements (t is an rb tree instance) */ +#define rbt_size(t) (t->n_nodes) + +/* Check whether the rb tree is empty (t is an rb tree instance) */ +#define rbt_empty(t) (rbt_size(t) == 0) + +/* Get data value (t is the data type, n is an rb tree node instance) */ +#define rbt_value(t, n) ((t*) &n->value[0]) + +/* Compare a key with the node value (t is tree, k is key, n is node)*/ +#define rbt_compare(t, k, n) (t->compare(k, n->value)) + +/**********************************************************************//** +Free an instance of a red black tree */ +void +rbt_free( +/*=====*/ + ib_rbt_t* tree); /*!< in: rb tree to free */ +/**********************************************************************//** +Create an instance of a red black tree +@return rb tree instance */ +ib_rbt_t* +rbt_create( +/*=======*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_compare compare); /*!< in: comparator */ +/**********************************************************************//** +Create an instance of a red black tree, whose comparison function takes +an argument +@return rb tree instance */ +ib_rbt_t* +rbt_create_arg_cmp( +/*===============*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_arg_compare + compare, /*!< in: comparator */ + void* cmp_arg); /*!< in: compare fn arg */ +/**********************************************************************//** +Delete a node from the red black tree, identified by key */ +ibool +rbt_delete( +/*=======*/ + /* in: TRUE on success */ + ib_rbt_t* tree, /* in: rb tree */ 
+ const void* key); /* in: key to delete */ +/**********************************************************************//** +Remove a node from the red black tree, NOTE: This function will not delete +the node instance, THAT IS THE CALLERS RESPONSIBILITY. +@return the deleted node with the const. */ +ib_rbt_node_t* +rbt_remove_node( +/*============*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* + node); /*!< in: node to delete, this + is a fudge and declared const + because the caller has access + only to const nodes.*/ +/**********************************************************************//** +Add data to the red black tree, identified by key (no dups yet!) +@return inserted node */ +const ib_rbt_node_t* +rbt_insert( +/*=======*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const void* key, /*!< in: key for ordering */ + const void* value); /*!< in: data that will be + copied to the node.*/ +/**********************************************************************//** +Add a new node to the tree, useful for data that is pre-sorted. +@return appended node */ +const ib_rbt_node_t* +rbt_add_node( +/*=========*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + const void* value); /*!< in: this value is copied + to the node */ +/**********************************************************************//** +Return the left most data node in the tree +@return left most node */ +const ib_rbt_node_t* +rbt_first( +/*======*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the right most data node in the tree +@return right most node */ +const ib_rbt_node_t* +rbt_last( +/*=====*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the next node from current. +@return successor node to current that is passed in. 
*/ +const ib_rbt_node_t* +rbt_next( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Return the prev node from current. +@return precedessor node to current that is passed in */ +const ib_rbt_node_t* +rbt_prev( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Search for the key, a node will be retuned in parent.last, whether it +was found or not. If not found then parent.last will contain the +parent node for the possibly new key otherwise the matching node. +@return result of last comparison */ +int +rbt_search( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key); /*!< in: key to search */ +/**********************************************************************//** +Search for the key, a node will be retuned in parent.last, whether it +was found or not. If not found then parent.last will contain the +parent node for the possibly new key otherwise the matching node. +@return result of last comparison */ +int +rbt_search_cmp( +/*===========*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key, /*!< in: key to search */ + ib_rbt_compare compare, /*!< in: comparator */ + ib_rbt_arg_compare + arg_compare); /*!< in: fn to compare items + with argument */ +/**********************************************************************//** +Merge the node from dst into src. Return the number of nodes merged. +@return no. 
of recs merged */ +ulint +rbt_merge_uniq( +/*===========*/ + ib_rbt_t* dst, /*!< in: dst rb tree */ + const ib_rbt_t* src); /*!< in: src rb tree */ +#if defined UNIV_DEBUG || defined IB_RBT_TESTING +/**********************************************************************//** +Verify the integrity of the RB tree. For debugging. 0 failure else height +of tree (in count of black nodes). +@return TRUE if OK FALSE if tree invalid. */ +ibool +rbt_validate( +/*=========*/ + const ib_rbt_t* tree); /*!< in: tree to validate */ +#endif /* UNIV_DEBUG || IB_RBT_TESTING */ + +#endif /* INNOBASE_UT0RBT_H */ diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h new file mode 100644 index 00000000..5b1ae5bc --- /dev/null +++ b/storage/innobase/include/ut0rnd.h @@ -0,0 +1,137 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0rnd.h +Random numbers and hashing + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0rnd_h +#define ut0rnd_h + +#include "ut0byte.h" +#include <my_sys.h> + +#ifndef UNIV_INNOCHECKSUM +/** Seed value of ut_rnd_gen() */ +extern std::atomic<uint32_t> ut_rnd_current; + +/** @return a pseudo-random 32-bit number */ +inline uint32_t ut_rnd_gen() +{ + /* This is a Galois linear-feedback shift register. + https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Galois_LFSRs + The generating primitive Galois Field polynomial is the Castagnoli + polynomial that was made popular by CRC-32C: + x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+ + x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */ + const uint32_t crc32c= 0x1edc6f41; + + uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed); + + if (UNIV_UNLIKELY(rnd == 0)) + { + rnd= static_cast<uint32_t>(my_interval_timer()); + if (!rnd) rnd= 1; + } + else + { + bool lsb= rnd & 1; + rnd>>= 1; + if (lsb) + rnd^= crc32c; + } + + ut_rnd_current.store(rnd, std::memory_order_relaxed); + return rnd; +} + +/** @return a random number between 0 and n-1, inclusive */ +inline ulint ut_rnd_interval(ulint n) +{ + return n > 1 ? static_cast<ulint>(ut_rnd_gen() % n) : 0; +} + +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime or some +random number to work reliably. 
+@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size); /*!< in: hash table size */ +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ + MY_ATTRIBUTE((const)); +/*************************************************************//** +Folds a character string ending in the null character. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + const char* str) /*!< in: null-terminated string */ + MY_ATTRIBUTE((warn_unused_result)); +/***********************************************************//** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. +@return prime */ +ulint +ut_find_prime( +/*==========*/ + ulint n) /*!< in: positive number > 100 */ + MY_ATTRIBUTE((const)); + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ + MY_ATTRIBUTE((const)); +/*************************************************************//** +Folds a binary string. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ + MY_ATTRIBUTE((pure)); + +#include "ut0rnd.ic" + +#endif diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic new file mode 100644 index 00000000..c0105160 --- /dev/null +++ b/storage/innobase/include/ut0rnd.ic @@ -0,0 +1,150 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0rnd.ic +Random numbers and hashing + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#define UT_HASH_RANDOM_MASK 1463735687 +#define UT_HASH_RANDOM_MASK2 1653893711 + +#ifndef UNIV_INNOCHECKSUM + +/*******************************************************//** +Hashes a ulint key into a bucket number below table_size: the key is +XORed with UT_HASH_RANDOM_MASK2 and reduced modulo table_size, which should +be a prime or some random number for the hash table to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size) /*!< in: hash table size */ +{ + /* table_size is the divisor below and must be nonzero */ + ut_ad(table_size); + + return((key ^ UT_HASH_RANDOM_MASK2) % table_size); +} + +/*************************************************************//** +Folds a 64-bit integer. 
+@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ +{ + /* Fold the low 32 bits together with the high 32 bits */ + const ulint low = (ulint) d & ULINT32_MASK; + const ulint high = (ulint) (d >> 32); + + return(ut_fold_ulint_pair(low, high)); +} + +/*************************************************************//** +Folds a character string ending in the null character. +Each character is converted from plain char to ulint before it is mixed +into the running fold value. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + const char* str) /*!< in: null-terminated string */ +{ + ulint fold = 0; + + ut_ad(str); + + /* Mix in one character at a time, up to the terminating NUL */ + for (; *str != '\0'; str++) { + fold = ut_fold_ulint_pair(fold, (ulint)(*str)); + } + + return(fold); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints into one value by XOR, shift and addition with the +two UT_HASH_RANDOM_MASK constants. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ +{ + const ulint shifted = ((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1; + + return((shifted ^ UT_HASH_RANDOM_MASK) + n2); +} + +/*************************************************************//** +Folds a binary string. 
+@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ +{ + ulint fold = 0; + /* End of the part that is folded 8 bytes per iteration. The mask + has the full width of ulint; a 32-bit constant such as 0xFFFFFFF8 + would also clear the upper half of a 64-bit len, so that an input + of 4GiB or more would be folded only partially. */ + const byte* str_end = str + (len & ~(ulint) 7); + + ut_ad(str || !len); + + /* Unrolled main loop: 8 bytes per iteration */ + while (str < str_end) { + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + /* Fold the remaining len % 8 bytes */ + switch (len & 0x7) { + case 7: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 6: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 5: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 4: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 3: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 2: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 1: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + return(fold); +} diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h new file mode 100644 index 00000000..4f1d4c04 --- /dev/null +++ b/storage/innobase/include/ut0sort.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0sort.h +Sort utility + +Created 11/9/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0sort_h +#define ut0sort_h + +/* This module gives a macro definition of the body of +a standard sort function for an array of elements of any +type. The comparison function is given as a parameter to +the macro. The sort algorithm is mergesort, which has an O(n log n) +worst-case running time. +*/ + +/*******************************************************************//** +This macro expands to the body of a standard sort function. +The sort function uses mergesort and must be defined separately +for each type of array. +Also the comparison function has to be defined individually +for each array cell type. SORT_FUN is the sort function name. +The function takes the array to be sorted (ARR), +the array of auxiliary space (AUX_ARR) of same size, +and the low (LOW), inclusive, and high (HIGH), noninclusive, +limits for the sort interval as arguments. +CMP_FUN is the comparison function name. It takes as arguments +two elements from the array and returns 1, if the first is bigger, +0 if equal, and -1 if the second bigger. 
*/ +/* NOTE: the expansion is a complete function body. It calls SORT_FUN +recursively on both halves of [LOW, HIGH) and leaves through plain +"return;" statements, so it must be used as the whole body of a void +function SORT_FUN(ARR, AUX_ARR, LOW, HIGH). LOW < HIGH is debug-asserted. +Every macro argument is expanded several times and must therefore be free +of side effects. */ +#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\ +{\ + ulint ut_sort_mid77;\ + ulint ut_sort_i77;\ + ulint ut_sort_low77;\ + ulint ut_sort_high77;\ +\ + ut_ad((LOW) < (HIGH));\ + ut_ad(ARR);\ + ut_ad(AUX_ARR);\ +\ + if ((LOW) == (HIGH) - 1) {\ + return;\ + } else if ((LOW) == (HIGH) - 2) {\ + if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\ + (AUX_ARR)[LOW] = (ARR)[LOW];\ + (ARR)[LOW] = (ARR)[(HIGH) - 1];\ + (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\ + }\ + return;\ + }\ +\ + ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\ +\ + SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\ + SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\ +\ + ut_sort_low77 = (LOW);\ + ut_sort_high77 = ut_sort_mid77;\ +\ + for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\ +\ + if (ut_sort_low77 >= ut_sort_mid77) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else if (ut_sort_high77 >= (HIGH)) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + } else if (CMP_FUN((ARR)[ut_sort_low77],\ + (ARR)[ut_sort_high77]) > 0) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + }\ + }\ +\ + memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\ + ((HIGH) - (LOW)) * sizeof *(ARR));\ +}\ + + +#endif + + diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h new file mode 100644 index 00000000..17fbd91b --- /dev/null +++ b/storage/innobase/include/ut0stage.h @@ -0,0 +1,499 @@ +/***************************************************************************** + +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0stage.h +Supplementary code to performance schema stage instrumentation. + +Created Nov 12, 2014 Vasil Dimov +*******************************************************/ + +#ifndef ut0stage_h +#define ut0stage_h + +#include <algorithm> +#include <math.h> + +#include "my_global.h" /* needed for headers from mysql/psi/ */ + +#include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */ +#include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */ + +#include "dict0mem.h" /* dict_index_t */ +#include "row0log.h" /* row_log_estimate_work() */ +#include "srv0srv.h" /* ut_stage_alter_t */ + +#ifdef HAVE_PSI_STAGE_INTERFACE + +/** Class used to report ALTER TABLE progress via performance_schema. 
+The only user of this class is the ALTER TABLE code and it calls the methods +in the following order +constructor +begin_phase_read_pk() + multiple times: + n_pk_recs_inc() // once per record read + inc() // once per page read +end_phase_read_pk() +if any new indexes are being added, for each one: + begin_phase_sort() + multiple times: + inc() // once per record sorted + begin_phase_insert() + multiple times: + inc() // once per record inserted + begin_phase_log_index() + multiple times: + inc() // once per log-block applied +begin_phase_log_table() + multiple times: + inc() // once per log-block applied +begin_phase_end() +destructor + +This class knows the specifics of each phase and tries to increment the +progress in an even manner across the entire ALTER TABLE lifetime. */ +class ut_stage_alter_t { +public: + /** Constructor. + Every data member gets a defined initial value here, so that a + method called outside the order documented above cannot read an + indeterminate value. m_n_recs_per_page and m_sort_multi_factor + start at 1, which keeps the interval computed in inc() nonzero. + @param[in] pk primary key of the old table */ + explicit + ut_stage_alter_t( + const dict_index_t* pk) + : + m_progress(NULL), + m_pk(pk), + m_n_pk_recs(0), + m_n_pk_pages(0), + m_n_recs_per_page(1.0), + m_n_sort_indexes(0), + m_sort_multi_factor(1), + m_n_recs_processed(0), + m_cur_phase(NOT_STARTED) + { + } + + /** Destructor. */ + ~ut_stage_alter_t(); + + /** Flag an ALTER TABLE start (read primary key phase). + @param[in] n_sort_indexes number of indexes that will be sorted + during ALTER TABLE, used for estimating the total work to be done */ + void + begin_phase_read_pk( + ulint n_sort_indexes); + + /** Increment the number of records in PK (table) with 1. + This is used to get more accurate estimate about the number of + records per page which is needed because some phases work on + per-page basis while some work on per-record basis and we want + to get the progress as even as possible. */ + void + n_pk_recs_inc(); + + /** Flag either one record or one page processed, depending on the + current phase. + @param[in] inc_val flag this many units processed at once */ + void + inc( + ulint inc_val = 1); + + /** Flag the end of reading of the primary key. 
+ Here we know the exact number of pages and records and calculate + the number of records per page and refresh the estimate. */ + void + end_phase_read_pk(); + + /** Flag the beginning of the sort phase. + @param[in] sort_multi_factor since merge sort processes + one page more than once we only update the estimate once per this + many pages processed. */ + void + begin_phase_sort( + double sort_multi_factor); + + /** Flag the beginning of the insert phase. */ + void + begin_phase_insert(); + + /** Flag the beginning of the log index phase. */ + void + begin_phase_log_index(); + + /** Flag the beginning of the log table phase. */ + void + begin_phase_log_table(); + + /** Flag the beginning of the end phase. */ + void + begin_phase_end(); + +private: + + /** Update the estimate of total work to be done. */ + void + reestimate(); + + /** Change the current phase. + @param[in] new_stage pointer to the new stage to change to */ + void + change_phase( + const PSI_stage_info* new_stage); + + /** Performance schema accounting object. */ + PSI_stage_progress* m_progress; + + /** Old table PK. Used for calculating the estimate. */ + const dict_index_t* m_pk; + + /** Number of records in the primary key (table), including delete + marked records. */ + ulint m_n_pk_recs; + + /** Number of leaf pages in the primary key. */ + ulint m_n_pk_pages; + + /** Estimated number of records per page in the primary key. + Set by end_phase_read_pk(). */ + double m_n_recs_per_page; + + /** Number of indexes that are being added. + Set by begin_phase_read_pk(). */ + ulint m_n_sort_indexes; + + /** During the sort phase, increment the counter once per this + many pages processed. This is because sort processes one page more + than once. Set by begin_phase_sort(). */ + ulint m_sort_multi_factor; + + /** Number of records processed during sort & insert phases. We + need to increment the counter only once per page, or once per + recs-per-page records. */ + ulint m_n_recs_processed; + + /** Current phase. 
*/ + enum { + NOT_STARTED = 0, + READ_PK = 1, + SORT = 2, + INSERT = 3, + /* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h + LOG_INDEX = 5, + LOG_TABLE = 6, */ + LOG_INNODB_INDEX = 5, + LOG_INNODB_TABLE = 6, + END = 7, + } m_cur_phase; +}; + +/** Destructor. If a stage was started, report all estimated work as +completed and end the stage. */ +inline +ut_stage_alter_t::~ut_stage_alter_t() +{ + if (m_progress != NULL) { + /* Set completed = estimated before we quit. */ + mysql_stage_set_work_completed( + m_progress, + mysql_stage_get_work_estimated(m_progress)); + + mysql_end_stage(); + } +} + +/** Flag an ALTER TABLE start (read primary key phase). +@param[in] n_sort_indexes number of indexes that will be sorted +during ALTER TABLE, used for estimating the total work to be done */ +inline +void +ut_stage_alter_t::begin_phase_read_pk( + ulint n_sort_indexes) +{ + m_cur_phase = READ_PK; + m_n_sort_indexes = n_sort_indexes; + + m_progress = mysql_set_stage( + srv_stage_alter_table_read_pk_internal_sort.m_key); + + /* Start counting from zero and compute the first estimate */ + mysql_stage_set_work_completed(m_progress, 0); + reestimate(); +} + +/** Count one more record of the PK (table). +The record count is used to get a more accurate estimate of the number of +records per page, which is needed because some phases work on +per-page basis while some work on per-record basis and we want +to get the progress as even as possible. */ +inline +void +ut_stage_alter_t::n_pk_recs_inc() +{ + ++m_n_pk_recs; +} + +/** Flag either one record or one page processed, depending on the +current phase. 
*/ +inline +void +ut_stage_alter_t::inc(ulint inc_val) +{ + if (m_progress == NULL) { + return; + } + /* multi_factor stretches the reporting interval (SORT phase only); + should_proceed is cleared when this call must not advance the + work_completed counter */ + ulint multi_factor = 1; + bool should_proceed = true; + + switch (m_cur_phase) { + case NOT_STARTED: + ut_error; + case READ_PK: + m_n_pk_pages++; + ut_ad(inc_val == 1); + /* Overall the read pk phase will read all the pages from the + PK and will do work, proportional to the number of added + indexes, thus when this is called once per read page we + increment with 1 + m_n_sort_indexes */ + inc_val = 1 + m_n_sort_indexes; + break; + case SORT: + multi_factor = m_sort_multi_factor; + /* fall through */ + case INSERT: { + /* Increment the progress every nth record. During + sort and insert phases, this method is called once per + record processed. We need fractional point numbers here + because "records per page" is such a number naturally and + to avoid rounding skew we want, for example: if there are + (double) N records per page, then the work_completed + should be incremented on the inc() calls round(k*N), + for k=1,2,3... + Below, k is the index of the multiple of every_nth that is + nearest to m_n_recs_processed, and the counter advances only + when m_n_recs_processed equals round(k * every_nth). */ + const double every_nth = m_n_recs_per_page * + static_cast<double>(multi_factor); + + const ulint k = static_cast<ulint>( + round(static_cast<double>(m_n_recs_processed) / + every_nth)); + + const ulint nth = static_cast<ulint>( + round(static_cast<double>(k) * every_nth)); + + should_proceed = m_n_recs_processed == nth; + + m_n_recs_processed++; + + break; + } + /* JAN: TODO: MySQL 5.7 + case LOG_INDEX: + break; + case LOG_TABLE: + break; */ + case LOG_INNODB_INDEX: + case LOG_INNODB_TABLE: + break; + case END: + break; + } + /* Advance the counter by inc_val and refresh the estimate */ + if (should_proceed) { + mysql_stage_inc_work_completed(m_progress, inc_val); + reestimate(); + } +} + +/** Flag the end of reading of the primary key. +Here we know the exact number of pages and records and calculate +the number of records per page and refresh the estimate. 
*/ +inline +void +ut_stage_alter_t::end_phase_read_pk() +{ + reestimate(); + + /* The number of pages in the PK could be 0 if the tree is + empty. In this case we set m_n_recs_per_page to 1 to avoid + division by zero later. Otherwise use the measured average, + but never less than 1. */ + m_n_recs_per_page = m_n_pk_pages == 0 + ? 1.0 + : std::max( + static_cast<double>(m_n_pk_recs) + / static_cast<double>(m_n_pk_pages), + 1.0); +} + +/** Flag the beginning of the sort phase. +@param[in] sort_multi_factor since merge sort processes +one page more than once we only update the estimate once per this +many pages processed. */ +inline +void +ut_stage_alter_t::begin_phase_sort( + double sort_multi_factor) +{ + /* Round to the nearest integer, but never go below 1 */ + m_sort_multi_factor = sort_multi_factor <= 1.0 + ? 1 + : static_cast<ulint>(round(sort_multi_factor)); + + change_phase(&srv_stage_alter_table_merge_sort); +} + +/** Switch to the insert phase. */ +inline +void +ut_stage_alter_t::begin_phase_insert() +{ + change_phase(&srv_stage_alter_table_insert); +} + +/** Switch to the log index phase. */ +inline +void +ut_stage_alter_t::begin_phase_log_index() +{ + change_phase(&srv_stage_alter_table_log_index); +} + +/** Switch to the log table phase. */ +inline +void +ut_stage_alter_t::begin_phase_log_table() +{ + change_phase(&srv_stage_alter_table_log_table); +} + +/** Switch to the end phase. */ +inline +void +ut_stage_alter_t::begin_phase_end() +{ + change_phase(&srv_stage_alter_table_end); +} + +/** Update the estimate of total work to be done. */ +inline +void +ut_stage_alter_t::reestimate() +{ + if (m_progress == NULL) { + return; + } + + /* During the log table phase we calculate the estimate as + work done so far + log size remaining. 
*/ + if (m_cur_phase == LOG_INNODB_TABLE) { + mysql_stage_set_work_estimated( + m_progress, + mysql_stage_get_work_completed(m_progress) + + row_log_estimate_work(m_pk)); + return; + } + + /* During the other phases we use a formula, regardless of + how much work has been done so far. */ + + /* For number of pages in the PK - if the PK has not been + read yet (we are still in the READ_PK phase), use + stat_n_leaf_pages (approximate), otherwise use the exact + number we gathered. */ + const ulint n_pk_pages + = m_cur_phase != READ_PK + ? m_n_pk_pages + : m_pk->stat_n_leaf_pages; + + ulonglong estimate __attribute__((unused)) + = n_pk_pages + * (1 /* read PK */ + + m_n_sort_indexes /* row_merge_buf_sort() inside the + read PK per created index */ + + m_n_sort_indexes * 2 /* sort & insert per created index */) + + row_log_estimate_work(m_pk); + + /* Prevent estimate < completed; the formula above does not take + the completed work into account */ + estimate = std::max(estimate, + mysql_stage_get_work_completed(m_progress)); + + mysql_stage_set_work_estimated(m_progress, estimate); +} + +/** Change the current phase. 
+@param[in] new_stage pointer to the new stage to change to */ +inline +void +ut_stage_alter_t::change_phase( + const PSI_stage_info* new_stage) +{ + if (m_progress == NULL) { + return; + } + /* Map the stage descriptor to the internal phase by comparing + addresses; an unknown descriptor is a fatal error */ + if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) { + m_cur_phase = READ_PK; + } else if (new_stage == &srv_stage_alter_table_merge_sort) { + m_cur_phase = SORT; + } else if (new_stage == &srv_stage_alter_table_insert) { + m_cur_phase = INSERT; + /* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */ + } else if (new_stage == &srv_stage_alter_table_log_index) { + m_cur_phase = LOG_INNODB_INDEX; + } else if (new_stage == &srv_stage_alter_table_log_table) { + m_cur_phase = LOG_INNODB_TABLE; + } else if (new_stage == &srv_stage_alter_table_end) { + m_cur_phase = END; + } else { + ut_error; + } + /* Read the counters of the current stage before switching, then + copy them to the progress object that mysql_set_stage() returns + for the new stage */ + const ulonglong c = mysql_stage_get_work_completed(m_progress); + const ulonglong e = mysql_stage_get_work_estimated(m_progress); + + m_progress = mysql_set_stage(new_stage->m_key); + + mysql_stage_set_work_completed(m_progress, c); + mysql_stage_set_work_estimated(m_progress, e); +} +#else /* HAVE_PSI_STAGE_INTERFACE */ +/* Replacement used when HAVE_PSI_STAGE_INTERFACE is not defined: the same +public methods, all with empty bodies */ +class ut_stage_alter_t { +public: + explicit ut_stage_alter_t(const dict_index_t*) {} + + void begin_phase_read_pk(ulint) {} + + void n_pk_recs_inc() {} + + void inc() {} + void inc(ulint) {} + + void end_phase_read_pk() {} + + void begin_phase_sort(double) {} + + void begin_phase_insert() {} + + void begin_phase_log_index() {} + + void begin_phase_log_table() {} + + void begin_phase_end() {} +}; + +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +#endif /* ut0stage_h */ diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h new file mode 100644 index 00000000..9f11944c --- /dev/null +++ b/storage/innobase/include/ut0ut.h @@ -0,0 +1,453 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0ut.h +Various utilities + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0ut_h +#define ut0ut_h + +/* Do not include univ.i because univ.i includes this. */ + +#include <ostream> +#include <sstream> +#include <string.h> + +#ifndef UNIV_INNOCHECKSUM + +#include "db0err.h" + +#include <time.h> + +#ifndef MYSQL_SERVER +#include <ctype.h> +#endif /* MYSQL_SERVER */ + +#include <stdarg.h> + +#include <string> + +/** Index name prefix in fast index creation, as a string constant */ +#define TEMP_INDEX_PREFIX_STR "\377" + +#define ut_max std::max +#define ut_min std::min + +/** Calculate the minimum of two pairs. 
+@param[out] min_hi MSB of the minimum pair +@param[out] min_lo LSB of the minimum pair +@param[in] a_hi MSB of the first pair +@param[in] a_lo LSB of the first pair +@param[in] b_hi MSB of the second pair +@param[in] b_lo LSB of the second pair */ +UNIV_INLINE +void +ut_pair_min( + ulint* min_hi, + ulint* min_lo, + ulint a_hi, + ulint a_lo, + ulint b_hi, + ulint b_lo); +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b); /*!< in: ulint */ +/** Compare two pairs of integers. +@param[in] a_h more significant part of first pair +@param[in] a_l less significant part of first pair +@param[in] b_h more significant part of second pair +@param[in] b_l less significant part of second pair +@return comparison result of (a_h,a_l) and (b_h,b_l) +@retval -1 if (a_h,a_l) is less than (b_h,b_l) +@retval 0 if (a_h,a_l) is equal to (b_h,b_l) +@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */ +UNIV_INLINE +int +ut_pair_cmp( + ulint a_h, + ulint a_l, + ulint b_h, + ulint b_l) + MY_ATTRIBUTE((warn_unused_result)); + +/*************************************************************//** +Calculates fast the remainder of n/m when m is a power of two. +@param n in: numerator +@param m in: denominator, must be a power of two +@return the remainder of n/m */ +template <typename T> inline T ut_2pow_remainder(T n, T m){return n & (m - 1);} +/*************************************************************//** +Calculates the biggest multiple of m that is not bigger than n +when m is a power of two. In other words, rounds n down to m * k. 
+@param n in: number to round down +@param m in: alignment, must be a power of two +@return n rounded down to the biggest possible integer multiple of m */ +template <typename T> inline T ut_2pow_round(T n, T m) { return n & ~(m - 1); } +/********************************************************//** +Calculates the smallest multiple of m that is not smaller than n +when m is a power of two. In other words, rounds n up to m * k. +The macro arguments are parenthesized in the expansion, so that compound +expressions (for example a | b, or a conditional expression) are aligned +as a whole. m is expanded twice and must be free of side effects. +@param n in: number to round up +@param m in: alignment, must be a power of two +@return n rounded up to the smallest possible integer multiple of m */ +#define UT_CALC_ALIGN(n, m) (((n) + (m) - 1) & ~((m) - 1)) +template <typename T> inline T ut_calc_align(T n, T m) +{ return static_cast<T>(UT_CALC_ALIGN(n, m)); } + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. +@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n); /*!< in: number */ +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n); /*!< in: number */ + +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +ulint +ut_time_ms(void); +/*============*/ +#endif /* !UNIV_INNOCHECKSUM */ + +/** Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. +@param b in: bits +@return number of bytes (octets) needed to represent b */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) >> 3) + +/** Determines if a number is zero or a power of two. +@param[in] n number +@return nonzero if n is zero or a power of two; zero otherwise */ +#define ut_is_2pow(n) (!((n) & ((n) - 1))) + +/** Functor that compares two C strings. 
Can be used as a comparator for +e.g. std::map that uses char* as keys. */ +struct ut_strcmp_functor +{ + bool operator()( + const char* a, + const char* b) const + { + return(strcmp(a, b) < 0); + } +}; + +/**********************************************************//** +Prints a timestamp to a file. */ +void +ut_print_timestamp( +/*===============*/ + FILE* file) /*!< in: file where to print */ + ATTRIBUTE_COLD __attribute__((nonnull)); + +#ifndef UNIV_INNOCHECKSUM + +/**********************************************************//** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */ +void +ut_sprintf_timestamp( +/*=================*/ + char* buf); /*!< in: buffer where to sprintf */ + +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +void +ut_print_buf( +/*=========*/ + FILE* file, /*!< in: file where to print */ + const void* buf, /*!< in: memory buffer */ + ulint len); /*!< in: length of the buffer */ + +/*************************************************************//** +Prints the contents of a memory buffer in hex. */ +void +ut_print_buf_hex( +/*=============*/ + std::ostream& o, /*!< in/out: output stream */ + const void* buf, /*!< in: memory buffer */ + ulint len) /*!< in: length of the buffer */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +void +ut_print_buf( +/*=========*/ + std::ostream& o, /*!< in/out: output stream */ + const void* buf, /*!< in: memory buffer */ + ulint len) /*!< in: length of the buffer */ + MY_ATTRIBUTE((nonnull)); + +/* Forward declaration of transaction handle */ +struct trx_t; + +/** Get a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. 
+ @param [in] trx transaction (NULL=no quotes). + @param [in] name table name. + @retval String quoted as an SQL identifier. +*/ +std::string +ut_get_name( + const trx_t* trx, + const char* name); + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +void +ut_print_name( +/*==========*/ + FILE* ef, /*!< in: stream */ + const trx_t* trx, /*!< in: transaction */ + const char* name); /*!< in: table name to print */ +/** Format a table name, quoted as an SQL identifier. +If the name contains a slash '/', the result will contain two +identifiers separated by a period (.), as in SQL +database_name.table_name. +@see table_name_t +@param[in] name table or index name +@param[out] formatted formatted result, will be NUL-terminated +@param[in] formatted_size size of the buffer in bytes +@return pointer to 'formatted' */ +char* +ut_format_name( + const char* name, + char* formatted, + ulint formatted_size); + +/**********************************************************************//** +Catenate files. */ +void +ut_copy_file( +/*=========*/ + FILE* dest, /*!< in: output file */ + FILE* src); /*!< in: input file to be appended to output */ + +/*************************************************************//** +Convert an error number to a human readable text message. The +returned string is static and should not be freed or modified. +@return string, describing the error */ +const char* +ut_strerr( +/*======*/ + dberr_t num); /*!< in: error number */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#ifdef UNIV_PFS_MEMORY + +/** Extract the basename of a file without its extension. +For example, extract "foo0bar" out of "/path/to/foo0bar.cc". +@param[in] file file path, e.g. "/path/to/foo0bar.cc" +@param[out] base result, e.g. 
"foo0bar" +@param[in] base_size size of the output buffer 'base', if there +is not enough space, then the result will be truncated, but always +'\0'-terminated +@return number of characters that would have been printed if the size +were unlimited (not including the final ‘\0’) */ +size_t +ut_basename_noext( + const char* file, + char* base, + size_t base_size); + +#endif /* UNIV_PFS_MEMORY */ + +namespace ib { + +/** This is a wrapper class, used to print any unsigned integer type +in hexadecimal format. The main purpose of this data type is to +overload the global operator<<, so that we can print the given +wrapper value in hex. */ +struct hex { + explicit hex(uintmax_t t): m_val(t) {} + const uintmax_t m_val; +}; + +/** This is an overload of the global operator<< for the user defined type +ib::hex. The unsigned value held in the ib::hex wrapper class will be printed +into the given output stream in hexadecimal format. +The format flags of lhs are saved before the output and put back afterwards. +@param[in,out] lhs the output stream into which rhs is written. +@param[in] rhs the object to be written into lhs. +@retval reference to the output stream. */ +inline +std::ostream& +operator<<( + std::ostream& lhs, + const hex& rhs) +{ + const std::ios_base::fmtflags ff = lhs.flags(); + lhs << std::showbase << std::hex << rhs.m_val; + /* flags() replaces the whole flag set. setf() would only OR the + saved bits into the current ones, which would leave std::showbase + and the hex basefield bit set on the caller's stream. */ + lhs.flags(ff); + return(lhs); +} + +/** The class logger is the base class of all the error log related classes. +It contains a std::ostringstream object. The main purpose of this class is +to forward operator<< to the underlying std::ostringstream object. Do not +use this class directly, instead use one of the derived classes. 
*/ +class logger +{ +protected: + /* This class must not be used directly */ + ATTRIBUTE_COLD ATTRIBUTE_NOINLINE logger() {} +public: + template<typename T> ATTRIBUTE_COLD ATTRIBUTE_NOINLINE + logger& operator<<(const T& rhs) + { + m_oss << rhs; + return *this; + } + + /** Handle a fixed character string in the same way as a pointer to + an unknown-length character string, to reduce object code bloat. */ + template<size_t N> logger& operator<<(const char (&rhs)[N]) + { return *this << static_cast<const char*>(rhs); } + + /** Output an error code name */ + ATTRIBUTE_COLD logger& operator<<(dberr_t err); + + /** Append a string. + @param buf string buffer + @param size buffer size + @return the output stream */ + ATTRIBUTE_COLD __attribute__((noinline)) + std::ostream &write(const char *buf, std::streamsize size) + { + return m_oss.write(buf, size); + } + + std::ostream &write(const byte *buf, std::streamsize size) + { return write(reinterpret_cast<const char*>(buf), size); } + + std::ostringstream m_oss; +}; + +/** The class info is used to emit informational log messages. It is to be +used similar to std::cout. But the log messages will be emitted only when +the dtor is called. The preferred usage of this class is to make use of +unnamed temporaries as follows: + +info() << "The server started successfully."; + +In the above usage, the temporary object will be destroyed at the end of the +statement and hence the log message will be emitted at the end of the +statement. If a named object is created, then the log message will be emitted +only when it goes out of scope or destroyed. */ +class info : public logger { +public: + ATTRIBUTE_COLD + ~info(); +}; + +/** The class warn is used to emit warnings. Refer to the documentation of +class info for further details. */ +class warn : public logger { +public: + ATTRIBUTE_COLD + ~warn(); +}; + +/** The class error is used to emit error messages. Refer to the +documentation of class info for further details. 
*/ +class error : public logger { +public: + ATTRIBUTE_COLD + ~error(); + /** Indicates that error::~error() was invoked. Can be used to + determine if error messages were logged during innodb code execution. + @return true if there were error messages, false otherwise. */ + static bool was_logged() { return logged; } + +private: + /** true if error::~error() was invoked, false otherwise */ + static bool logged; +}; + +/** The class fatal is used to emit an error message and stop the server +by crashing it. Use this class when MySQL server needs to be stopped +immediately. Refer to the documentation of class info for usage details. */ +class fatal : public logger { +public: + ATTRIBUTE_NORETURN + ~fatal(); +}; + +/** Emit an error message if the given predicate is true, otherwise emit a +warning message. */ +class error_or_warn : public logger { +public: + ATTRIBUTE_COLD + error_or_warn(bool pred) + : m_error(pred) + {} + + ATTRIBUTE_COLD + ~error_or_warn(); +private: + const bool m_error; +}; + +/** Emit a fatal message if the given predicate is true, otherwise emit an +error message. */ +class fatal_or_error : public logger { +public: + ATTRIBUTE_COLD + fatal_or_error(bool pred) + : m_fatal(pred) + {} + + ATTRIBUTE_COLD + ~fatal_or_error(); +private: + const bool m_fatal; +}; + +} // namespace ib + +#include "ut0ut.ic" + +#endif + diff --git a/storage/innobase/include/ut0ut.ic b/storage/innobase/include/ut0ut.ic new file mode 100644 index 00000000..73feaf82 --- /dev/null +++ b/storage/innobase/include/ut0ut.ic @@ -0,0 +1,143 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0ut.ic +Various utilities + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#include <algorithm> + +/** Calculate the minimum of two pairs. +@param[out] min_hi MSB of the minimum pair +@param[out] min_lo LSB of the minimum pair +@param[in] a_hi MSB of the first pair +@param[in] a_lo LSB of the first pair +@param[in] b_hi MSB of the second pair +@param[in] b_lo LSB of the second pair */ +UNIV_INLINE +void +ut_pair_min( + ulint* min_hi, + ulint* min_lo, + ulint a_hi, + ulint a_lo, + ulint b_hi, + ulint b_lo) +{ + if (a_hi == b_hi) { + *min_hi = a_hi; + *min_lo = std::min(a_lo, b_lo); + } else if (a_hi < b_hi) { + *min_hi = a_hi; + *min_lo = a_lo; + } else { + *min_hi = b_hi; + *min_lo = b_lo; + } +} + +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b) /*!< in: ulint */ +{ + if (a < b) { + return(-1); + } else if (a == b) { + return(0); + } else { + return(1); + } +} + +/** Compare two pairs of integers. 
+@param[in] a_h more significant part of first pair +@param[in] a_l less significant part of first pair +@param[in] b_h more significant part of second pair +@param[in] b_l less significant part of second pair +@return comparison result of (a_h,a_l) and (b_h,b_l) +@retval -1 if (a_h,a_l) is less than (b_h,b_l) +@retval 0 if (a_h,a_l) is equal to (b_h,b_l) +@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */ +UNIV_INLINE +int +ut_pair_cmp( + ulint a_h, + ulint a_l, + ulint b_h, + ulint b_l) +{ + /* The more significant parts decide unless they are equal. */ + if (a_h < b_h) { + return(-1); + } + if (a_h > b_h) { + return(1); + } + return(ut_ulint_cmp(a_l, b_l)); +} + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. Note that the result is 1 (not 0) for n == 1, because the +function always returns at least 1. +@return logarithm in the base 2, rounded upward (minimum 1) */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n) /*!< in: number != 0 */ +{ + ulint res; + + res = 0; + + ut_ad(n > 0); + + n = n - 1; + + /* Count how many times n - 1 can be halved before reaching 0. */ + for (;;) { + n = n / 2; + + if (n == 0) { + break; + } + + res++; + } + + return(res + 1); +} + +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n) /*!< in: exponent; must be less than the + bit width of ulint, as the result is + computed with a left shift */ +{ + return((ulint) 1 << n); +} diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h new file mode 100644 index 00000000..cfdaee60 --- /dev/null +++ b/storage/innobase/include/ut0vec.h @@ -0,0 +1,285 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.h +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#ifndef IB_VECTOR_H +#define IB_VECTOR_H + +#include "mem0mem.h" + +struct ib_alloc_t; +struct ib_vector_t; + +typedef void* (*ib_mem_alloc_t)( + /* out: Pointer to allocated memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + ulint size); /* in: Number of bytes to allocate */ + +typedef void (*ib_mem_free_t)( + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + void* ptr); /* in: Memory to free */ + +typedef void* (*ib_mem_resize_t)( + /* out: Pointer to resized memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator */ + void* ptr, /* in: Memory to resize */ + ulint old_size, /* in: Old memory size in bytes */ + ulint new_size); /* in: New size in bytes */ + +typedef int (*ib_compare_t)(const void*, const void*); + +/* An automatically resizing vector datatype with the following properties: + + -All memory allocation is done through an allocator, which is responsible for +freeing it when done with the vector. 
*/ + +/* This is useful shorthand for elements of type void* */ +#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n)) +#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n)) + +/* Get the allocator of vector v. The argument is parenthesized so that +any pointer-valued expression can be passed. */ +#define ib_vector_allocator(v) ((v)->allocator) + +/******************************************************************** +Create a new vector with the given initial size. */ +ib_vector_t* +ib_vector_create( +/*=============*/ + /* out: vector */ + ib_alloc_t* alloc, /* in: Allocator */ + /* in: size of the data item */ + ulint sizeof_value, + ulint size); /* in: initial size */ + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the heap allocator. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Push a new element to the vector, increasing its size if necessary, +if elem is not NULL then elem is copied to the vector.*/ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec, /* in/out: vector */ + const void* elem); /* in: data element */ + +/******************************************************************** +Pop the last element from the vector.*/ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to the popped (former last) + element */ + ib_vector_t* vec); /* in/out: vector */ + +/*******************************************************************//** +Remove an element from the vector +@return pointer to the "removed" element */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in: vector */ + const void* elem); /*!< in: value to remove */ + +/******************************************************************** +Get the number of 
elements in the vector. 
*/ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Increase the size of the vector. */ +void +ib_vector_resize( +/*=============*/ + /* out: void */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec); /*!< in: vector */ + +/****************************************************************//** +Get the n'th element. The index must be less than the number of +elements in use. +@return pointer to the n'th element */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n); /*!< in: element index to get */ + +/******************************************************************** +Const version of the get n'th element. +@return pointer to the n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n); /* in: element index to get */ +/****************************************************************//** +Get last element. The vector must not be empty. +@return pointer to the last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec); /*!< in: vector */ +/****************************************************************//** +Set the n'th element by copying sizeof_value bytes from elem. */ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem); /*!< in: data element */ + +/******************************************************************** +Reset the vector size to 0 elements. 
*/ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: pointer to last element */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector (const version). */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: pointer to last element */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Sort the vector elements in place with qsort(). */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + ib_vector_t* vec, /* in/out: vector */ + ib_compare_t compare); /* in: the comparator to use for sort */ + +/******************************************************************** +The default ib_vector_t heap free. Does nothing. */ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* ptr); /* in: memory to free (ignored) */ + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + /* out: pointer to allocated memory */ + ib_alloc_t* allocator, /* in: allocator */ + ulint size); /* in: size in bytes */ + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. +Uses mem_heap_alloc(). 
*/ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + /* out: pointer to reallocated + memory */ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size); /* in: new size in bytes */ + +/******************************************************************** +Create a heap allocator that uses the passed in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + /* out: heap allocator instance */ + mem_heap_t* heap); /* in: heap to use */ + +/******************************************************************** +Free a heap allocator. This frees the memory heap that the allocator +points to via its arg field. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc); /* in: alloc instance to free */ + +/* Allocator used by ib_vector_t. */ +struct ib_alloc_t { + ib_mem_alloc_t mem_malloc; /* For allocating memory */ + ib_mem_free_t mem_release; /* For freeing memory */ + ib_mem_resize_t mem_resize; /* For resizing memory */ + void* arg; /* Currently if not NULL then it + points to the heap instance */ +}; + +/* See comment at beginning of file. */ +struct ib_vector_t { + ib_alloc_t* allocator; /* Allocator, because one size + doesn't fit all */ + void* data; /* data elements */ + ulint used; /* number of elements currently used */ + ulint total; /* number of elements allocated */ + /* Size of a data item */ + ulint sizeof_value; +}; + +#include "ut0vec.ic" + +#endif /* IB_VECTOR_H */ diff --git a/storage/innobase/include/ut0vec.ic b/storage/innobase/include/ut0vec.ic new file mode 100644 index 00000000..531f0f22 --- /dev/null +++ b/storage/innobase/include/ut0vec.ic @@ -0,0 +1,348 @@ +/***************************************************************************** + +Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.ic +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#define IB_VEC_OFFSET(v, i) (vec->sizeof_value * i) + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + ulint size) /* in: size in bytes */ +{ + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + return(mem_heap_alloc(heap, size)); +} + +/******************************************************************** +The default ib_vector_t heap free. Does nothing. */ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */ + void* ptr UNIV_UNUSED) /* in: size in bytes */ +{ + /* We can't free individual elements. */ +} + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. 
+We always assume new_size >= old_size, so the buffer won't overflow. +Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size) /* in: new size in bytes */ +{ + void* new_ptr; + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + ut_a(new_size >= old_size); + new_ptr = mem_heap_alloc(heap, new_size); + /* Only the first old_size bytes are copied; the old block is + not released here. */ + memcpy(new_ptr, old_ptr, old_size); + + return(new_ptr); +} + +/******************************************************************** +Create a heap allocator that uses the passed in heap. The allocator +object itself is allocated from that same heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + mem_heap_t* heap) /* in: heap to use */ +{ + ib_alloc_t* heap_alloc; + + heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc)); + + heap_alloc->arg = heap; + heap_alloc->mem_release = ib_heap_free; + heap_alloc->mem_malloc = ib_heap_malloc; + heap_alloc->mem_resize = ib_heap_resize; + + return(heap_alloc); +} + +/******************************************************************** +Free a heap allocator. This frees the memory heap stored in the arg +field of the allocator. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc) /* in: alloc instance to free */ +{ + mem_heap_free((mem_heap_t*) ib_ut_alloc->arg); +} + +/******************************************************************** +Get number of elements in vector. */ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector*/ + const ib_vector_t* vec) /* in: vector */ +{ + return(vec->used); +} + +/****************************************************************//** +Get n'th element. 
*/ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n) /*!< in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} + +/******************************************************************** +Const version of the get n'th element. +@return n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n) /* in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} +/****************************************************************//** +Get last element. The vector must not be empty. +@return last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec) /*!< in: vector */ +{ + ut_a(vec->used > 0); + + return((byte*) ib_vector_get(vec, vec->used - 1)); +} + +/****************************************************************//** +Set the n'th element. */ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem) /*!< in: data element */ +{ + void* slot; + + ut_a(n < vec->used); + + slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n)); + memcpy(slot, elem, vec->sizeof_value); +} + +/******************************************************************** +Reset the vector size to 0 elements. */ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + /* out: void */ + ib_vector_t* vec) /* in: vector */ +{ + vec->used = 0; +} + +/******************************************************************** +Get the last element of the vector. 
*/ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: pointer to last element */ + ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get(vec, ib_vector_size(vec) - 1)); +} + +/******************************************************************** +Get the last element of the vector (const version). */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: pointer to last element */ + const ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get_const(vec, ib_vector_size(vec) - 1)); +} + +/****************************************************************//** +Remove the last element from the vector. +@return last vector element */ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to element */ + ib_vector_t* vec) /* in: vector */ +{ + void* elem; + + ut_a(vec->used > 0); + + elem = ib_vector_last(vec); + --vec->used; + + return(elem); +} + +/******************************************************************** +Append an element to the vector, if elem != NULL then copy the data +from elem.*/ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec, /* in: vector */ + const void* elem) /* in: element to add (can be NULL) */ +{ + void* last; + + if (vec->used >= vec->total) { + ib_vector_resize(vec); + } + + last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used); + +#ifdef UNIV_DEBUG + memset(last, 0, vec->sizeof_value); +#endif + + if (elem) { + memcpy(last, elem, vec->sizeof_value); + } + + ++vec->used; + + return(last); +} + +/*******************************************************************//** +Remove the first element whose stored pointer value equals elem. +Each slot is read as a void* for the comparison; the elements after the +match are shifted down by one slot. 
+@return pointer to the slot that held the element (unless it was the +last slot, it now holds the element that followed), or NULL if elem +was not found */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in: vector */ + const void* elem) /*!< in: value to remove */ +{ + void* current = NULL; + void* next; + ulint i; + ulint old_used_count = vec->used; + + for (i = 0; i < 
vec->used; i++) { + current = ib_vector_get(vec, i); + + if (*(void**) current == elem) { + /* The last element can simply be popped. */ + if (i == vec->used - 1) { + return(ib_vector_pop(vec)); + } + + /* Close the gap by moving the tail down one slot. */ + next = ib_vector_get(vec, i + 1); + memmove(current, next, vec->sizeof_value + * (vec->used - i - 1)); + --vec->used; + break; + } + } + + /* vec->used only changes when a match was removed. */ + return((old_used_count != vec->used) ? current : NULL); +} + +/******************************************************************** +Sort the vector elements. */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + /* out: void */ + ib_vector_t* vec, /* in: vector */ + ib_compare_t compare)/* in: the comparator to use for sort */ +{ + qsort(vec->data, vec->used, vec->sizeof_value, compare); +} + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the heap allocator. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec) /* in, own: vector */ +{ + /* Currently we only support one type of allocator - heap, + when the heap is freed all the elements are freed too. */ + + /* Only the heap allocator uses the arg field. */ + ut_ad(vec->allocator->arg != NULL); + + mem_heap_free((mem_heap_t*) vec->allocator->arg); +} + +/******************************************************************** +Test whether a vector is empty or not. 
+@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec) /*!< in: vector */ +{ + return(ib_vector_size(vec) == 0); +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h new file mode 100644 index 00000000..34762298 --- /dev/null +++ b/storage/innobase/include/ut0wqueue.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0wqueue.h +A work queue + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A Work queue. Threads can add work items to the queue and other threads can +wait for work items to be available and take them off the queue for +processing. +************************************************************************/ + +#ifndef IB_WORK_QUEUE_H +#define IB_WORK_QUEUE_H + +#include "ut0list.h" +#include "mem0mem.h" + +// Forward declaration +struct ib_list_t; + +/** Work queue */ +struct ib_wqueue_t +{ + /** Mutex protecting everything */ + ib_mutex_t mutex; + /** Work item list */ + ib_list_t* items; +}; + +/****************************************************************//** +Create a new work queue. +@return work queue */ +ib_wqueue_t* +ib_wqueue_create(); +/*===============*/ + +/****************************************************************//** +Free a work queue. */ +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +/** Add a work item to the queue. 
+@param[in,out] wq work queue +@param[in] item work item +@param[in,out] heap memory heap to use for allocating list node +@param[in] wq_locked work queue mutex locked */ +void +ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap, + bool wq_locked = false); + +/** Check if queue is empty. +@param wq work queue +@return whether the queue is empty */ +bool ib_wqueue_is_empty(ib_wqueue_t* wq); + +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*!< in: work queue */ +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +#endif /* IB_WORK_QUEUE_H */ |