summaryrefslogtreecommitdiffstats
path: root/storage/innobase/include
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 12:24:36 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 12:24:36 +0000
commit06eaf7232e9a920468c0f8d74dcf2fe8b555501c (patch)
treee2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/include
parentInitial commit. (diff)
downloadmariadb-06eaf7232e9a920468c0f8d74dcf2fe8b555501c.tar.xz
mariadb-06eaf7232e9a920468c0f8d74dcf2fe8b555501c.zip
Adding upstream version 1:10.11.6.upstream/1%10.11.6
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/include')
-rw-r--r--storage/innobase/include/btr0btr.h543
-rw-r--r--storage/innobase/include/btr0btr.inl111
-rw-r--r--storage/innobase/include/btr0bulk.h371
-rw-r--r--storage/innobase/include/btr0cur.h855
-rw-r--r--storage/innobase/include/btr0cur.inl170
-rw-r--r--storage/innobase/include/btr0defragment.h65
-rw-r--r--storage/innobase/include/btr0pcur.h459
-rw-r--r--storage/innobase/include/btr0pcur.inl372
-rw-r--r--storage/innobase/include/btr0sea.h403
-rw-r--r--storage/innobase/include/btr0sea.inl117
-rw-r--r--storage/innobase/include/btr0types.h154
-rw-r--r--storage/innobase/include/buf0block_hint.h76
-rw-r--r--storage/innobase/include/buf0buddy.h91
-rw-r--r--storage/innobase/include/buf0buf.h2190
-rw-r--r--storage/innobase/include/buf0buf.inl132
-rw-r--r--storage/innobase/include/buf0checksum.h57
-rw-r--r--storage/innobase/include/buf0dblwr.h164
-rw-r--r--storage/innobase/include/buf0dump.h44
-rw-r--r--storage/innobase/include/buf0flu.h125
-rw-r--r--storage/innobase/include/buf0lru.h193
-rw-r--r--storage/innobase/include/buf0rea.h120
-rw-r--r--storage/innobase/include/buf0types.h235
-rw-r--r--storage/innobase/include/data0data.h704
-rw-r--r--storage/innobase/include/data0data.inl633
-rw-r--r--storage/innobase/include/data0type.h591
-rw-r--r--storage/innobase/include/data0type.inl487
-rw-r--r--storage/innobase/include/data0types.h36
-rw-r--r--storage/innobase/include/db0err.h170
-rw-r--r--storage/innobase/include/dict0boot.h297
-rw-r--r--storage/innobase/include/dict0crea.h277
-rw-r--r--storage/innobase/include/dict0crea.inl136
-rw-r--r--storage/innobase/include/dict0defrag_bg.h101
-rw-r--r--storage/innobase/include/dict0dict.h1744
-rw-r--r--storage/innobase/include/dict0dict.inl1217
-rw-r--r--storage/innobase/include/dict0load.h220
-rw-r--r--storage/innobase/include/dict0mem.h2649
-rw-r--r--storage/innobase/include/dict0mem.inl68
-rw-r--r--storage/innobase/include/dict0pagecompress.h61
-rw-r--r--storage/innobase/include/dict0pagecompress.inl81
-rw-r--r--storage/innobase/include/dict0stats.h238
-rw-r--r--storage/innobase/include/dict0stats.inl219
-rw-r--r--storage/innobase/include/dict0stats_bg.h59
-rw-r--r--storage/innobase/include/dict0types.h176
-rw-r--r--storage/innobase/include/dyn0buf.h442
-rw-r--r--storage/innobase/include/dyn0types.h39
-rw-r--r--storage/innobase/include/eval0eval.h109
-rw-r--r--storage/innobase/include/eval0eval.inl254
-rw-r--r--storage/innobase/include/eval0proc.h94
-rw-r--r--storage/innobase/include/eval0proc.inl88
-rw-r--r--storage/innobase/include/fil0crypt.h396
-rw-r--r--storage/innobase/include/fil0crypt.inl81
-rw-r--r--storage/innobase/include/fil0fil.h1823
-rw-r--r--storage/innobase/include/fil0pagecompress.h57
-rw-r--r--storage/innobase/include/fsp0file.h509
-rw-r--r--storage/innobase/include/fsp0fsp.h762
-rw-r--r--storage/innobase/include/fsp0space.h209
-rw-r--r--storage/innobase/include/fsp0sysspace.h278
-rw-r--r--storage/innobase/include/fsp0types.h404
-rw-r--r--storage/innobase/include/fts0ast.h340
-rw-r--r--storage/innobase/include/fts0blex.h702
-rw-r--r--storage/innobase/include/fts0fts.h947
-rw-r--r--storage/innobase/include/fts0opt.h39
-rw-r--r--storage/innobase/include/fts0pars.h72
-rw-r--r--storage/innobase/include/fts0plugin.h50
-rw-r--r--storage/innobase/include/fts0priv.h485
-rw-r--r--storage/innobase/include/fts0priv.inl121
-rw-r--r--storage/innobase/include/fts0tlex.h702
-rw-r--r--storage/innobase/include/fts0tokenize.h189
-rw-r--r--storage/innobase/include/fts0types.h354
-rw-r--r--storage/innobase/include/fts0types.inl231
-rw-r--r--storage/innobase/include/fts0vlc.h124
-rw-r--r--storage/innobase/include/fut0lst.h156
-rw-r--r--storage/innobase/include/gis0geo.h122
-rw-r--r--storage/innobase/include/gis0rtree.h513
-rw-r--r--storage/innobase/include/gis0rtree.inl245
-rw-r--r--storage/innobase/include/gis0type.h146
-rw-r--r--storage/innobase/include/ha0ha.h60
-rw-r--r--storage/innobase/include/ha0ha.inl154
-rw-r--r--storage/innobase/include/ha0storage.h137
-rw-r--r--storage/innobase/include/ha0storage.inl142
-rw-r--r--storage/innobase/include/ha_prototypes.h476
-rw-r--r--storage/innobase/include/handler0alter.h108
-rw-r--r--storage/innobase/include/hash0hash.h190
-rw-r--r--storage/innobase/include/ibuf0ibuf.h436
-rw-r--r--storage/innobase/include/ibuf0ibuf.inl282
-rw-r--r--storage/innobase/include/lock0iter.h66
-rw-r--r--storage/innobase/include/lock0lock.h1271
-rw-r--r--storage/innobase/include/lock0lock.inl78
-rw-r--r--storage/innobase/include/lock0prdt.h192
-rw-r--r--storage/innobase/include/lock0priv.h582
-rw-r--r--storage/innobase/include/lock0priv.inl255
-rw-r--r--storage/innobase/include/lock0types.h251
-rw-r--r--storage/innobase/include/log0crypt.h115
-rw-r--r--storage/innobase/include/log0log.h529
-rw-r--r--storage/innobase/include/log0recv.h491
-rw-r--r--storage/innobase/include/log0types.h38
-rw-r--r--storage/innobase/include/mach0data.h375
-rw-r--r--storage/innobase/include/mach0data.inl837
-rw-r--r--storage/innobase/include/mariadb_stats.h119
-rw-r--r--storage/innobase/include/mem0mem.h345
-rw-r--r--storage/innobase/include/mem0mem.inl468
-rw-r--r--storage/innobase/include/mtr0log.h637
-rw-r--r--storage/innobase/include/mtr0mtr.h780
-rw-r--r--storage/innobase/include/mtr0types.h347
-rw-r--r--storage/innobase/include/os0file.h1188
-rw-r--r--storage/innobase/include/os0file.inl412
-rw-r--r--storage/innobase/include/page0cur.h303
-rw-r--r--storage/innobase/include/page0cur.inl203
-rw-r--r--storage/innobase/include/page0page.h1101
-rw-r--r--storage/innobase/include/page0page.inl550
-rw-r--r--storage/innobase/include/page0types.h188
-rw-r--r--storage/innobase/include/page0zip.h383
-rw-r--r--storage/innobase/include/page0zip.inl317
-rw-r--r--storage/innobase/include/pars0grm.h151
-rw-r--r--storage/innobase/include/pars0opt.h68
-rw-r--r--storage/innobase/include/pars0pars.h695
-rw-r--r--storage/innobase/include/pars0sym.h243
-rw-r--r--storage/innobase/include/pars0types.h50
-rw-r--r--storage/innobase/include/que0que.h314
-rw-r--r--storage/innobase/include/que0que.inl245
-rw-r--r--storage/innobase/include/que0types.h97
-rw-r--r--storage/innobase/include/read0types.h275
-rw-r--r--storage/innobase/include/rem0cmp.h286
-rw-r--r--storage/innobase/include/rem0rec.h1276
-rw-r--r--storage/innobase/include/rem0rec.inl1134
-rw-r--r--storage/innobase/include/rem0types.h78
-rw-r--r--storage/innobase/include/row0ext.h101
-rw-r--r--storage/innobase/include/row0ext.inl87
-rw-r--r--storage/innobase/include/row0ftsort.h268
-rw-r--r--storage/innobase/include/row0import.h67
-rw-r--r--storage/innobase/include/row0ins.h224
-rw-r--r--storage/innobase/include/row0log.h239
-rw-r--r--storage/innobase/include/row0merge.h496
-rw-r--r--storage/innobase/include/row0mysql.h841
-rw-r--r--storage/innobase/include/row0purge.h149
-rw-r--r--storage/innobase/include/row0quiesce.h67
-rw-r--r--storage/innobase/include/row0row.h431
-rw-r--r--storage/innobase/include/row0row.inl221
-rw-r--r--storage/innobase/include/row0sel.h457
-rw-r--r--storage/innobase/include/row0types.h54
-rw-r--r--storage/innobase/include/row0uins.h50
-rw-r--r--storage/innobase/include/row0umod.h46
-rw-r--r--storage/innobase/include/row0undo.h114
-rw-r--r--storage/innobase/include/row0upd.h559
-rw-r--r--storage/innobase/include/row0upd.inl153
-rw-r--r--storage/innobase/include/row0vers.h143
-rw-r--r--storage/innobase/include/rw_lock.h138
-rw-r--r--storage/innobase/include/small_vector.h100
-rw-r--r--storage/innobase/include/srv0mon.h846
-rw-r--r--storage/innobase/include/srv0mon.inl113
-rw-r--r--storage/innobase/include/srv0srv.h715
-rw-r--r--storage/innobase/include/srv0start.h124
-rw-r--r--storage/innobase/include/srw_lock.h554
-rw-r--r--storage/innobase/include/sux_lock.h472
-rw-r--r--storage/innobase/include/transactional_lock_guard.h174
-rw-r--r--storage/innobase/include/trx0i_s.h277
-rw-r--r--storage/innobase/include/trx0purge.h427
-rw-r--r--storage/innobase/include/trx0rec.h299
-rw-r--r--storage/innobase/include/trx0roll.h168
-rw-r--r--storage/innobase/include/trx0rseg.h301
-rw-r--r--storage/innobase/include/trx0sys.h1274
-rw-r--r--storage/innobase/include/trx0trx.h1268
-rw-r--r--storage/innobase/include/trx0trx.inl86
-rw-r--r--storage/innobase/include/trx0types.h131
-rw-r--r--storage/innobase/include/trx0undo.h514
-rw-r--r--storage/innobase/include/trx0undo.inl129
-rw-r--r--storage/innobase/include/trx0xa.h61
-rw-r--r--storage/innobase/include/univ.i503
-rw-r--r--storage/innobase/include/ut0byte.h107
-rw-r--r--storage/innobase/include/ut0byte.inl90
-rw-r--r--storage/innobase/include/ut0counter.h123
-rw-r--r--storage/innobase/include/ut0dbg.h179
-rw-r--r--storage/innobase/include/ut0list.h146
-rw-r--r--storage/innobase/include/ut0list.inl80
-rw-r--r--storage/innobase/include/ut0lst.h563
-rw-r--r--storage/innobase/include/ut0mem.h76
-rw-r--r--storage/innobase/include/ut0mem.inl246
-rw-r--r--storage/innobase/include/ut0new.h1099
-rw-r--r--storage/innobase/include/ut0pool.h365
-rw-r--r--storage/innobase/include/ut0rbt.h254
-rw-r--r--storage/innobase/include/ut0rnd.h128
-rw-r--r--storage/innobase/include/ut0rnd.inl128
-rw-r--r--storage/innobase/include/ut0sort.h104
-rw-r--r--storage/innobase/include/ut0stage.h499
-rw-r--r--storage/innobase/include/ut0ut.h444
-rw-r--r--storage/innobase/include/ut0ut.inl143
-rw-r--r--storage/innobase/include/ut0vec.h285
-rw-r--r--storage/innobase/include/ut0vec.inl348
-rw-r--r--storage/innobase/include/ut0wqueue.h86
189 files changed, 66546 insertions, 0 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
new file mode 100644
index 00000000..5a0401fa
--- /dev/null
+++ b/storage/innobase/include/btr0btr.h
@@ -0,0 +1,543 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.h
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "rem0types.h"
+#include "page0cur.h"
+#include "btr0types.h"
+#include "gis0type.h"
+
+#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level
+ (not really a hard limit).
+ Used in debug assertions
+ in btr_page_set_level and
+ btr_page_get_level */
+
+/** Maximum record size which can be stored on a page, without using the
+special big record storage structure */
+#define BTR_PAGE_MAX_REC_SIZE (srv_page_size / 2 - 200)
+
+/** @brief Maximum depth of a B-tree in InnoDB.
+
+Note that this isn't a maximum as such; none of the tree operations
+avoid producing trees bigger than this. It is instead a "max depth
+that other code must work with", useful for e.g. fixed-size arrays
+that must store some information about each level in a tree. In other
+words: if a B-tree with bigger depth than this is encountered, it is
+not acceptable for it to lead to mysterious memory corruption, but it
+is acceptable for the program to die with a clear assert failure. */
+#define BTR_MAX_LEVELS 100
+
+#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \
+ btr_latch_mode((latch_mode) & ~(BTR_INSERT \
+ | BTR_DELETE_MARK \
+ | BTR_RTREE_UNDO_INS \
+ | BTR_RTREE_DELETE_MARK \
+ | BTR_DELETE \
+ | BTR_IGNORE_SEC_UNIQUE \
+ | BTR_ALREADY_S_LATCHED \
+ | BTR_LATCH_FOR_INSERT \
+ | BTR_LATCH_FOR_DELETE))
+
+#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \
+ btr_latch_mode((latch_mode) \
+ & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE))
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return error code, or DB_SUCCESS */
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+ const dict_index_t* index) /*!< in: index tree */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Report a decryption failure. */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
+
+/** Get an index page and declare its latching order level.
+@param[in] index index tree
+@param[in] page page number
+@param[in] mode latch mode
+@param[in] merge whether change buffer merge should be attempted
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return block */
+buf_block_t *btr_block_get(const dict_index_t &index,
+ uint32_t page, rw_lock_type_t mode, bool merge,
+ mtr_t *mtr, dberr_t *err= nullptr);
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+ MY_ATTRIBUTE((warn_unused_result));
+/** Read the B-tree or R-tree PAGE_LEVEL.
+@param page B-tree or R-tree page
+@return number of child page links to reach the leaf level
+@retval 0 for leaf pages */
+inline uint16_t btr_page_get_level(const page_t *page)
+{
+ uint16_t level= mach_read_from_2(my_assume_aligned<2>
+ (PAGE_HEADER + PAGE_LEVEL + page));
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ return level;
+} MY_ATTRIBUTE((warn_unused_result))
+
+/** Read FIL_PAGE_NEXT.
+@param page buffer pool page
+@return previous page number */
+inline uint32_t btr_page_get_next(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+}
+
+/** Read FIL_PAGE_PREV.
+@param page buffer pool page
+@return previous page number */
+inline uint32_t btr_page_get_prev(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create the root node for a new index tree.
+@param[in] type type of the index
+@param[in,out] space tablespace where created
+@param[in] index_id index id
+@param[in] index index, or NULL to create a system table
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return page number of the created root
+@retval FIL_NULL if did not succeed */
+uint32_t
+btr_create(
+ ulint type,
+ fil_space_t* space,
+ index_id_t index_id,
+ dict_index_t* index,
+ mtr_t* mtr,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull(2,5,6), warn_unused_result));
+
+/** Free a persistent index tree if it exists.
+@param[in,out] space tablespce
+@param[in] page root page number
+@param[in] index_id PAGE_INDEX_ID contents
+@param[in,out] mtr mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+ index_id_t index_id, mtr_t *mtr);
+
+/** Drop a temporary table
+@param table temporary table */
+void btr_drop_temporary_table(const dict_table_t &table);
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in] table table containing an AUTO_INCREMENT column
+@param[in] col_no index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@param[in] autoinc the AUTO_INCREMENT value
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
+ MY_ATTRIBUTE((nonnull));
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out] root clustered index root page
+@param[in] index clustered index with instant ALTER TABLE
+@param[in,out] mtr mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+
+ATTRIBUTE_COLD __attribute__((nonnull))
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
+
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@param cursor page cursor
+@param mtr mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in] cursor insert position
+@return the first record to be moved to the right half page
+@retval NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor);
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in] cursor insert position
+@param[out] split_rec if split recommended, the first record
+ on the right half page, or
+ NULL if the to-be-inserted record
+ should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec);
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+dberr_t
+btr_insert_on_non_leaf_level(
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Set a child page pointer record as the predefined minimum record.
+@tparam has_prev whether the page is supposed to have a left sibling
+@param[in,out] rec leftmost record on a leftmost non-leaf page
+@param[in,out] block buffer pool block
+@param[in,out] mtr mini-transaction */
+template<bool has_prev= false>
+inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
+ mtr_t *mtr)
+{
+ ut_ad(block.page.frame == page_align(rec));
+ ut_ad(!page_is_leaf(block.page.frame));
+ ut_ad(has_prev == page_has_prev(block.page.frame));
+
+ rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;
+
+ if (block.page.zip.data)
+ /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED
+ page. We are not modifying the compressed page frame at all. */
+ *rec|= REC_INFO_MIN_REC_FLAG;
+ else
+ mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG);
+}
+
+/** Seek to the parent page of a B-tree page.
+@param[in,out] mtr mini-transaction
+@param[in,out] cursor cursor pointing to the x-latched parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
+ MY_ATTRIBUTE((nonnull,warn_unused_result));
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist.
+@return error code
+@retval DB_FAIL if the tree could not be merged */
+dberr_t
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+dberr_t
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated */
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index tree */
+ uint32_t hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr, /*!< in/out: mini-transaction
+ for x-latching and initializing
+ the page */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((warn_unused_result));
+/** Empty an index page (possibly the root page). @see btr_page_create().
+@param[in,out] block page to be emptied
+@param[in,out] page_zip compressed page frame, or NULL
+@param[in] index index of the page
+@param[in] level B-tree level of the page (0=leaf)
+@param[in,out] mtr mini-transaction */
+void
+btr_page_empty(
+ buf_block_t* block,
+ page_zip_des_t* page_zip,
+ dict_index_t* index,
+ ulint level,
+ mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull(1, 3, 5)));
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization). @see btr_page_empty(). */
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mtr */
+
+/** Free an index page.
+@param[in,out] index index tree
+@param[in,out] block block to be freed
+@param[in,out] mtr mini-transaction
+@param[in] blob whether this is freeing a BLOB page
+@param[in] latched whether index->table->space->x_lock() was called */
+MY_ATTRIBUTE((nonnull))
+dberr_t btr_page_free(dict_index_t *index, buf_block_t *block, mtr_t *mtr,
+ bool blob= false, bool space_latched= false);
+
+/**************************************************************//**
+Gets the root node of a tree and x- or s-latches it.
+@return root page, x- or s-latched */
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ rw_lock_type_t mode, /*!< in: either RW_S_LATCH
+ or RW_X_LATCH */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize_block(
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+ MY_ATTRIBUTE((warn_unused_result));
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return DB_SUCCESS if ok, error code if not */
+dberr_t
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or 0 */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Remove a page from the level list of pages.
+@param[in] block page to remove
+@param[in] index index tree
+@param[in,out] mtr mini-transaction */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+ const dict_index_t& index, mtr_t* mtr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+If page is the only on its level, this function moves its records to the
+father page, thus reducing the tree height.
+@return father block */
+buf_block_t*
+btr_lift_page_up(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ __attribute__((nonnull));
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
+
+#include "btr0btr.inl"
+
+/****************************************************************
+Global variable controlling if scrubbing should be performed */
+extern my_bool srv_immediate_scrub_data_uncompressed;
diff --git a/storage/innobase/include/btr0btr.inl b/storage/innobase/include/btr0btr.inl
new file mode 100644
index 00000000..9a9e39b6
--- /dev/null
+++ b/storage/innobase/include/btr0btr.inl
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+/** Set PAGE_LEVEL.
+@param[in,out] block buffer block
+@param[in] level page level
+@param[in,out] mtr mini-transaction */
+inline
+void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
+{
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+ if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
+}
+
+/** Set FIL_PAGE_NEXT.
+@param[in,out] block buffer block
+@param[in] next number of successor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_NEXT;
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/** Set FIL_PAGE_PREV.
+@param[in,out] block buffer block
+@param[in] prev number of predecessor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_PREV;
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == 4);
+
+ uint32_t page_no = mach_read_from_4(field);
+ ut_ad(page_no > 1);
+
+ return(page_no);
+}
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
new file mode 100644
index 00000000..9fcea86d
--- /dev/null
+++ b/storage/innobase/include/btr0bulk.h
@@ -0,0 +1,371 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0bulk.h
+The B-tree bulk load
+
+Created 03/11/2014 Shaohua Wang
+*************************************************************************/
+
+#ifndef btr0bulk_h
+#define btr0bulk_h
+
+#include "dict0dict.h"
+#include "rem0types.h"
+#include "page0cur.h"
+
+#include <vector>
+
+/** Innodb B-tree index fill factor for bulk load. */
+extern uint innobase_fill_factor;
+
+/*
+The proper function call sequence of PageBulk is as below:
+-- PageBulk::init
+-- PageBulk::insert
+-- PageBulk::finish
+-- PageBulk::compress(COMPRESSED table only)
+-- PageBulk::pageSplit(COMPRESSED table only)
+-- PageBulk::commit
+*/
+
+class PageBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] page_no page number
+ @param[in] level page level
+ @param[in] trx_id transaction id */
+ PageBulk(
+ dict_index_t* index,
+ trx_id_t trx_id,
+ uint32_t page_no,
+ ulint level)
+ :
+ m_heap(NULL),
+ m_index(index),
+ m_mtr(),
+ m_trx_id(trx_id),
+ m_block(NULL),
+ m_page(NULL),
+ m_page_zip(NULL),
+ m_cur_rec(NULL),
+ m_page_no(page_no),
+ m_level(level),
+ m_is_comp(dict_table_is_comp(index->table)),
+ m_heap_top(NULL),
+ m_rec_no(0),
+ m_free_space(0),
+ m_reserved_space(0),
+#ifdef UNIV_DEBUG
+ m_total_data(0),
+#endif /* UNIV_DEBUG */
+ m_modify_clock(0),
+ m_err(DB_SUCCESS)
+ {
+ ut_ad(!dict_index_is_spatial(m_index));
+ ut_ad(!m_index->table->is_temporary());
+ }
+
+ /** Deconstructor */
+ ~PageBulk()
+ {
+ mem_heap_free(m_heap);
+ }
+
+ /** Initialize members and allocate page if needed and start mtr.
+ Note: must be called and only once right after constructor.
+ @return error code */
+ dberr_t init();
+
+ /** Insert a record in the page.
+ @param[in] rec record
+ @param[in] offsets record offsets */
+ inline void insert(const rec_t* rec, rec_offs* offsets);
+private:
+ /** Page format */
+ enum format { REDUNDANT, DYNAMIC, COMPRESSED };
+ /** Mark end of insertion to the page. Scan all records to set page
+ dirs, and set page header members.
+ @tparam format the page format */
+ template<format> inline void finishPage();
+ /** Insert a record in the page.
+ @tparam format the page format
+ @param[in,out] rec record
+ @param[in] offsets record offsets */
+ template<format> inline void insertPage(rec_t* rec, rec_offs* offsets);
+
+public:
+ /** Mark end of insertion to the page. Scan all records to set page
+ dirs, and set page header members. */
+ inline void finish();
+
+ /** @return whether finish() actually needs to do something */
+ inline bool needs_finish() const;
+
+ /** Commit mtr for a page
+ @param[in] success Flag whether all inserts succeed. */
+ void commit(bool success);
+
+ /** Compress if it is compressed table
+ @return true compress successfully or no need to compress
+ @return false compress failed. */
+ bool compress();
+
+ /** Check whether the record needs to be stored externally.
+ @return true
+ @return false */
+ bool needExt(const dtuple_t* tuple, ulint rec_size);
+
+ /** Store external record
+ @param[in] big_rec external recrod
+ @param[in] offsets record offsets
+ @return error code */
+ dberr_t storeExt(const big_rec_t* big_rec, rec_offs* offsets);
+
+ /** Get node pointer
+ @return node pointer */
+ dtuple_t* getNodePtr();
+
+ /** Get split rec in the page. We split a page in half when compresssion
+ fails, and the split rec should be copied to the new page.
+ @return split rec */
+ rec_t* getSplitRec();
+
+ /** Copy all records after split rec including itself.
+ @param[in] rec split rec */
+ void copyIn(rec_t* split_rec);
+
+ /** Remove all records after split rec including itself.
+ @param[in] rec split rec */
+ void copyOut(rec_t* split_rec);
+
+ /** Set next page
+ @param[in] next_page_no next page no */
+ inline void setNext(ulint next_page_no);
+
+ /** Set previous page
+ @param[in] prev_page_no previous page no */
+ inline void setPrev(ulint prev_page_no);
+
+ /** Release block by commiting mtr */
+ inline void release();
+
+ /** Start mtr and latch block */
+ inline void latch();
+
+ /** Check if required space is available in the page for the rec
+ to be inserted. We check fill factor & padding here.
+ @param[in] length required length
+ @return true if space is available */
+ inline bool isSpaceAvailable(ulint rec_size);
+
+ /** Get page no */
+ uint32_t getPageNo() const { return m_page_no; }
+
+ /** Get page level */
+ ulint getLevel()
+ {
+ return(m_level);
+ }
+
+ /** Get record no */
+ ulint getRecNo()
+ {
+ return(m_rec_no);
+ }
+
+ /** Get page */
+ page_t* getPage()
+ {
+ return(m_page);
+ }
+
+ /** Get page zip */
+ page_zip_des_t* getPageZip()
+ {
+ return(m_page_zip);
+ }
+
+ dberr_t getError()
+ {
+ return(m_err);
+ }
+
+ void set_modified() { m_mtr.set_modified(*m_block); }
+
+ /* Memory heap for internal allocation */
+ mem_heap_t* m_heap;
+
+private:
+ /** The index B-tree */
+ dict_index_t* m_index;
+
+ /** The mini-transaction */
+ mtr_t m_mtr;
+
+ /** The transaction id */
+ trx_id_t m_trx_id;
+
+ /** The buffer block */
+ buf_block_t* m_block;
+
+ /** The page */
+ page_t* m_page;
+
+ /** The page zip descriptor */
+ page_zip_des_t* m_page_zip;
+
+ /** The current rec, just before the next insert rec */
+ rec_t* m_cur_rec;
+
+ /** The page no */
+ uint32_t m_page_no;
+
+ /** The page level in B-tree */
+ ulint m_level;
+
+ /** Flag: is page in compact format */
+ const bool m_is_comp;
+
+ /** The heap top in page for next insert */
+ byte* m_heap_top;
+
+ /** User record no */
+ ulint m_rec_no;
+
+ /** The free space left in the page */
+ ulint m_free_space;
+
+ /** The reserved space for fill factor */
+ ulint m_reserved_space;
+
+ /** The padding space for compressed page */
+ ulint m_padding_space;
+
+#ifdef UNIV_DEBUG
+ /** Total data in the page */
+ ulint m_total_data;
+#endif /* UNIV_DEBUG */
+
+ /** The modify clock value of the buffer block
+ when the block is re-pinned */
+ ib_uint64_t m_modify_clock;
+
+ /** Operation result DB_SUCCESS or error code */
+ dberr_t m_err;
+};
+
+typedef std::vector<PageBulk*, ut_allocator<PageBulk*> >
+ page_bulk_vector;
+
+class BtrBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] trx transaction */
+ BtrBulk(
+ dict_index_t* index,
+ const trx_t* trx)
+ :
+ m_index(index),
+ m_trx(trx)
+ {
+ ut_ad(!dict_index_is_spatial(index));
+ }
+
+ /** Insert a tuple
+ @param[in] tuple tuple to insert.
+ @return error code */
+ dberr_t insert(dtuple_t* tuple)
+ {
+ return(insert(tuple, 0));
+ }
+
+ /** Btree bulk load finish. We commit the last page in each level
+ and copy the last page in top level to the root page of the index
+ if no error occurs.
+ @param[in] err whether bulk load was successful until now
+ @return error code */
+ dberr_t finish(dberr_t err);
+
+ /** Release all latches */
+ void release();
+
+ /** Re-latch all latches */
+ void latch();
+
+ table_name_t table_name() { return m_index->table->name; }
+
+private:
+ /** Insert a tuple to a page in a level
+ @param[in] tuple tuple to insert
+ @param[in] level B-tree level
+ @return error code */
+ dberr_t insert(dtuple_t* tuple, ulint level);
+
+ /** Split a page
+ @param[in] page_bulk page to split
+ @param[in] next_page_bulk next page
+ @return error code */
+ dberr_t pageSplit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk);
+
+ /** Commit(finish) a page. We set next/prev page no, compress a page of
+ compressed table and split the page if compression fails, insert a node
+ pointer to father page if needed, and commit mini-transaction.
+ @param[in] page_bulk page to commit
+ @param[in] next_page_bulk next page
+ @param[in] insert_father flag whether need to insert node ptr
+ @return error code */
+ dberr_t pageCommit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk,
+ bool insert_father);
+
+ /** Abort a page when an error occurs
+ @param[in] page_bulk page bulk object
+ Note: we should call pageAbort for a PageBulk object, which is not in
+ m_page_bulks after pageCommit, and we will commit or abort PageBulk
+ objects in function "finish". */
+ void pageAbort(PageBulk* page_bulk)
+ {
+ page_bulk->commit(false);
+ }
+
+ /** Log free check */
+ inline void logFreeCheck();
+
+private:
+ /** B-tree index */
+ dict_index_t*const m_index;
+
+ /** Transaction */
+ const trx_t*const m_trx;
+
+ /** Root page level */
+ ulint m_root_level;
+
+ /** Page cursor vector for all level */
+ page_bulk_vector m_page_bulks;
+};
+
+#endif
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
new file mode 100644
index 00000000..f6abc9f5
--- /dev/null
+++ b/storage/innobase/include/btr0cur.h
@@ -0,0 +1,855 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.h
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "dict0dict.h"
+#include "page0cur.h"
+#include "btr0types.h"
+#include "rem0types.h"
+#include "gis0type.h"
+#include "my_base.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "srw_lock.h"
+#endif
+
+/** Mode flags for btr_cur operations; these can be ORed */
+enum {
+ /** do no undo logging */
+ BTR_NO_UNDO_LOG_FLAG = 1,
+ /** do no record lock checking */
+ BTR_NO_LOCKING_FLAG = 2,
+ /** sys fields will be found in the update vector or inserted
+ entry */
+ BTR_KEEP_SYS_FLAG = 4,
+
+ /** no rollback */
+ BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG,
+
+ /** btr_cur_pessimistic_update() must keep cursor position
+ when moving columns to big_rec */
+ BTR_KEEP_POS_FLAG = 8,
+ /** the caller is creating the index or wants to bypass the
+ index->info.online creation log */
+ BTR_CREATE_FLAG = 16,
+ /** the caller of btr_cur_optimistic_update() or
+ btr_cur_update_in_place() will take care of
+ updating IBUF_BITMAP_FREE */
+ BTR_KEEP_IBUF_BITMAP = 32
+};
+
+#include "que0types.h"
+#include "row0types.h"
+
+#define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+#define btr_cur_get_block(cursor) ((cursor)->page_cur.block)
+#define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec)
+
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the index of a cursor.
+@param cursor b-tree cursor
+@return index */
+#define btr_cur_get_index(cursor) ((cursor)->index())
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor);/*!< in: cursor */
+
+/** Load the instant ALTER TABLE metadata from the clustered index
+when loading a table definition.
+@param[in,out] table table definition from the data dictionary
+@return error code
+@retval DB_SUCCESS if no error occurred */
+dberr_t
+btr_cur_instant_init(dict_table_t* table)
+ ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
+
+/** Initialize the n_core_null_bytes on first access to a clustered
+index root page.
+@param[in] index clustered index that is on its first access
+@param[in] page clustered index root page
+@return whether the page is corrupted */
+bool
+btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
+ ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
+
+MY_ATTRIBUTE((warn_unused_result))
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given non-leaf level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+cursor->up_match and cursor->low_match both will have sensible values.
+Cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record.
+@param level the tree level of search
+@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that
+ it cannot get compared to the node ptr page number field!
+@param latch RW_S_LATCH or RW_X_LATCH
+@param cursor tree cursor; the cursor page is s- or x-latched, but see also
+ above!
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise */
+dberr_t btr_cur_search_to_nth_level(ulint level,
+ const dtuple_t *tuple,
+ rw_lock_type_t rw_latch,
+ btr_cur_t *cursor, mtr_t *mtr);
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction;
+ if this function returns DB_SUCCESS on
+ a leaf page of a secondary index in a
+ compressed tablespace, the caller must
+ mtr_commit(mtr) before latching
+ any further pages */
+ MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+See if there is enough place in the page modification log to log
+an update-in-place.
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if enough place;
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ page_cur_t* cursor, /*!< in/out: B-tree page cursor */
+#ifdef UNIV_DEBUG
+ rec_offs* offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+ ulint length, /*!< in: size needed */
+ bool create, /*!< in: true=delete-and-insert,
+ false=update-in-place */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,offsets,len,cr,mtr)
+#else /* UNIV_DEBUG */
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,len,cr,mtr)
+#endif /* UNIV_DEBUG */
+
+/** Apply an update vector to a record. No field size changes are allowed.
+
+This is usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page().
+@param[in,out] rec index record
+@param[in] index the index of the record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] update update vector
+@param[in,out] block index page
+@param[in,out] mtr mini-transaction */
+void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
+ const rec_offs *offsets, const upd_t *update,
+ buf_block_t *block, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page */
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error code */
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap
+ that can be emptied */
+ mem_heap_t* entry_heap,
+ /*!< in/out: memory heap for allocating
+ big_rec and the index tuple */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ upd_t* update, /*!< in/out: update vector; this is allowed to
+ also contain trx id and roll ptr fields.
+ Non-updated columns that are moved offpage will
+ be appended to this. */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; must be committed
+ before latching any further pages */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ buf_block_t* block, /*!< in/out: buffer block of the record */
+ rec_t* rec, /*!< in/out: record */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */
+ que_thr_t* thr, /*!< in: query thread */
+ const dtuple_t* entry, /*!< in: dtuple for the deleting record */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return whether compression occurred */
+bool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
+ cursor does not stay valid if !adjust and
+ compression occurs */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree.
+@return error code
+@retval DB_FAIL if the page would become too empty */
+dberr_t
+btr_cur_optimistic_delete(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ cursor stays valid: if deletion succeeds,
+ on function exit it points to the successor
+ of the deleted record */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+ extents so that he knows that the operation
+ will succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/** Delete the node pointer in a parent page.
+@param[in,out] parent cursor pointing to parent record
+@param[in,out] mtr mini-transaction */
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/** Arguments to btr_estimate_n_rows_in_range */
+struct btr_pos_t
+{
+ btr_pos_t(dtuple_t *arg_tuple,
+ page_cur_mode_t arg_mode,
+ page_id_t arg_page_id)
+ :tuple(arg_tuple), mode(arg_mode), page_id(arg_page_id)
+ {}
+
+ dtuple_t* tuple; /* Range start or end. May be NULL */
+ page_cur_mode_t mode; /* search mode for range */
+ page_id_t page_id; /* Out: Page where we found the tuple */
+};
+
+/** Estimates the number of rows in a given index range. Do search in the
+left page, then if there are pages between left and right ones, read a few
+pages to the right, if the right page is reached, fetch it and count the exact
+number of rows, otherwise count the estimated(see
+btr_estimate_n_rows_in_range_on_level() for details) number if rows, and
+fetch the right page. If leaves are reached, unlatch non-leaf pages except
+the right leaf parent. After the right leaf page is fetched, commit mtr.
+@param[in] index index
+@param[in] range_start range start
+@param[in] range_end range end
+@return estimated number of rows; */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+ btr_pos_t *range_start,
+ btr_pos_t *range_end);
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in] rec record
+@param[in] offsets array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+ulint
+btr_rec_get_externally_stored_len(
+ const rec_t* rec,
+ const rec_offs* offsets);
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field. */
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+ buf_block_t* block, /*!< in/out: index page */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(2,3,4,5,6)));
+
+/** Operation code for btr_store_big_rec_extern_fields(). */
+enum blob_op {
+ /** Store off-page columns for a freshly inserted record */
+ BTR_STORE_INSERT = 0,
+ /** Store off-page columns for an insert by update */
+ BTR_STORE_INSERT_UPDATE,
+ /** Store off-page columns for an update */
+ BTR_STORE_UPDATE,
+ /** Store off-page columns for a freshly inserted record by bulk */
+ BTR_STORE_INSERT_BULK
+};
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+ enum blob_op op) /*!< in: operation */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+ btr_pcur_t* pcur, /*!< in: a persistent cursor */
+ rec_offs* offsets, /*!< in/out: rec_get_offsets() on
+ pcur. the "external storage" flags
+ in offsets will correctly correspond
+ to rec when this function returns */
+ const big_rec_t*big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* btr_mtr, /*!< in/out: mtr containing the
+ latches to the clustered index. can be
+ committed and restarted. */
+ enum blob_op op) /*! in: operation code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned the externally stored field,
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ buf_block_t* block, /*!< in/out: page of field_ref */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ bool rollback, /*!< in: performing rollback? */
+ mtr_t* local_mtr) /*!< in: mtr containing the latch */
+ MY_ATTRIBUTE((nonnull(1,2,5,8)));
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record must be protected by a lock or a page latch.
+@param[out] buf the field, or a prefix of it
+@param[in] len length of buf, in bytes
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in] local_len length of data, in bytes
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+ulint
+btr_copy_externally_stored_field_prefix(
+ byte* buf,
+ ulint len,
+ ulint zip_size,
+ const byte* data,
+ ulint local_len);
+
+/** Copies an externally stored field of a record to mem heap.
+The clustered index record must be protected by a lock or a page latch.
+@param[out] len length of the whole field
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] local_len length of data
+@param[in,out] heap mem heap
+@return the whole field copied to heap */
+byte*
+btr_copy_externally_stored_field(
+ ulint* len,
+ const byte* data,
+ ulint zip_size,
+ ulint local_len,
+ mem_heap_t* heap);
+
+/** Copies an externally stored field of a record to mem heap.
+@param[in] rec record in a clustered index; must be
+protected by a lock or a page latch
+@param[in] offset array returned by rec_get_offsets()
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] no field number
+@param[out] len length of the field
+@param[in,out] heap mem heap
+@return the field copied to heap, or NULL if the field is incomplete */
+byte*
+btr_rec_copy_externally_stored_field(
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint zip_size,
+ ulint no,
+ ulint* len,
+ mem_heap_t* heap);
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT(index) \
+ ((srv_page_size * (ulint)((index)->merge_threshold)) / 100)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+struct btr_path_t {
+ /* Assume a page like:
+ records: (inf, a, b, c, d, sup)
+ index of the record: 0, 1, 2, 3, 4, 5
+ */
+
+ /** Index of the record where the page cursor stopped on this level
+ (index in alphabetical order). Value ULINT_UNDEFINED denotes array
+ end. In the above example, if the search stopped on record 'c', then
+ nth_rec will be 3. */
+ ulint nth_rec;
+
+ /** Number of the records on the page, not counting inf and sup.
+ In the above example n_recs will be 4. */
+ ulint n_recs;
+
+ /** Number of the page containing the record. */
+ uint32_t page_no;
+
+ /** Level of the page. If later we fetch the page under page_no
+ and it is no different level then we know that the tree has been
+ reorganized. */
+ ulint page_level;
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+ BTR_CUR_HASH = 1, /*!< successful shortcut using
+ the hash index */
+ BTR_CUR_HASH_FAIL, /*!< failure using hash, success using
+ binary search: the misleading hash
+ reference is stored in the field
+ hash_node, and might be necessary to
+ update */
+ BTR_CUR_BINARY, /*!< success using the binary search */
+ BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to
+ the insert buffer */
+ BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete
+ mark in the insert/delete buffer */
+ BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in
+ the insert/delete buffer */
+ BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_t {
+ page_cur_t page_cur; /*!< page cursor */
+ purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */
+ /*------------------------------*/
+ que_thr_t* thr; /*!< this field is only used
+ when search_leaf()
+ is called for an index entry
+ insertion: the calling query
+ thread is passed here to be
+ used in the insert buffer */
+ /*------------------------------*/
+ /** The following fields are used in
+ search_leaf() to pass information: */
+ /* @{ */
+ enum btr_cur_method flag; /*!< Search method used */
+ ulint tree_height; /*!< Tree height if the search is done
+ for a pessimistic insert or update
+ operation */
+ ulint up_match; /*!< If the search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ the first user record to the right of
+ the cursor record after search_leaf();
+ for the mode PAGE_CUR_GE, the matched
+ fields to the first user record AT THE
+ CURSOR or to the right of it;
+ NOTE that the up_match and low_match
+ values may exceed the correct values
+ for comparison to the adjacent user
+ record if that record is on a
+ different leaf page! (See the note in
+ row_ins_duplicate_error_in_clust.) */
+ ulint up_bytes; /*!< number of matched bytes to the
+ right at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint low_match; /*!< if search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ first user record AT THE CURSOR or
+ to the left of it after search_leaf();
+ NOT defined for PAGE_CUR_GE or any
+ other search modes; see also the NOTE
+ in up_match! */
+ ulint low_bytes; /*!< number of matched bytes to the
+ left at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint n_fields; /*!< prefix length used in a hash
+ search if hash_node != NULL */
+ ulint n_bytes; /*!< hash prefix bytes if hash_node !=
+ NULL */
+ ulint fold; /*!< fold value used in the search if
+ flag is BTR_CUR_HASH */
+ /* @} */
+ btr_path_t* path_arr; /*!< in estimating the number of
+ rows in range, we store in this array
+ information of the path through
+ the tree */
+ rtr_info_t* rtr_info; /*!< rtree search info */
+ btr_cur_t() { memset((void*) this, 0, sizeof *this); }
+
+ dict_index_t *index() const { return page_cur.index; }
+ buf_block_t *block() const { return page_cur.block; }
+
+ /** Open the cursor on the first or last record.
+ @param first true=first record, false=last record
+ @param index B-tree
+ @param latch_mode which latches to acquire
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+ mtr_t *mtr);
+
+ /** Search the leaf page record corresponding to a key.
+ @param tuple key to search for, with correct n_fields_cmp
+ @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+ @param latch_mode latch mode
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode, mtr_t *mtr);
+
+ /** Search the leaf page record corresponding to a key, exclusively latching
+ all sibling pages on the way.
+ @param tuple key to search for, with correct n_fields_cmp
+ @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ mtr_t *mtr);
+
+ /** Open the cursor at a random leaf page record.
+ @param offsets temporary memory for rec_get_offsets()
+ @param heap memory heap for rec_get_offsets()
+ @param mtr mini-transaction
+ @return error code */
+ inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
+ mtr_t &mtr);
+};
+
+/** Modify the delete-mark flag of a record.
+@tparam flag the value of the delete-mark flag
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in,out] mtr mini-transaction */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good change of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES 100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good change of success a little later. Sleep this time
+between retries. */
+static const std::chrono::milliseconds BTR_CUR_RETRY_SLEEP_TIME(50);
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID 0U /*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO 4U /*!< page no where stored */
+#define BTR_EXTERN_OFFSET 8U /*!< offset of BLOB header
+ on that page */
+#define BTR_EXTERN_LEN 12U /*!< 8 bytes containing the
+ length of the externally
+ stored part of the BLOB.
+ The 2 highest bits are
+ reserved to the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG 128U
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row. In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG 64U
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_non_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_non_sea_old;
+/** Number of successful adaptive hash index lookups in
+btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_sea;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_sea_old;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+extern uint btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+#include "btr0cur.inl"
+
+#endif
diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl
new file mode 100644
index 00000000..955cf342
--- /dev/null
+++ b/storage/innobase/include/btr0cur.inl
@@ -0,0 +1,170 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.ic
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#ifdef UNIV_DEBUG
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\
+if (btr_cur_limit_optimistic_insert_debug > 1\
+ && (NREC) >= btr_cur_limit_optimistic_insert_debug) {\
+ CODE;\
+}
+#else
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(buf_block_get_page_zip(btr_cur_get_block(cursor)));
+}
+
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_align(page_cur_get_rec(&(cursor->page_cur))));
+}
+
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor) /*!< out: cursor */
+{
+ page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
+ cursor->page_cur.index = index;
+}
+
+/*********************************************************************//**
+Checks if compressing an index page where a btr cursor is placed makes
+sense.
+@return TRUE if compression is recommended */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const page_t* page;
+
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U,
+ return(FALSE));
+
+ if (!page_has_siblings(page)
+ || page_get_data_size(page)
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
+
+ /* The page fillfactor has dropped below a predefined
+ minimum value OR the level in the B-tree contains just
+ one page: we recommend compression if this is not the
+ root page. */
+
+ return cursor->index()->page
+ != btr_cur_get_block(cursor)->page.id().page_no();
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if the record on which the cursor is placed can be deleted without
+making tree compression necessary (or, recommended).
+@return TRUE if can be deleted without recommended compression */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+/*================================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if (!page_has_siblings(page) || page_get_n_recs(page) < 2
+ || page_get_data_size(page) - rec_size
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ return cursor->index()->page
+ == btr_cur_get_block(cursor)->page.id().page_no();
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+ enum blob_op op) /*!< in: operation */
+{
+ switch (op) {
+ case BTR_STORE_INSERT:
+ case BTR_STORE_INSERT_BULK:
+ return(FALSE);
+ case BTR_STORE_INSERT_UPDATE:
+ case BTR_STORE_UPDATE:
+ return(TRUE);
+ }
+
+ ut_ad(0);
+ return(FALSE);
+}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
new file mode 100644
index 00000000..0523829b
--- /dev/null
+++ b/storage/innobase/include/btr0defragment.h
@@ -0,0 +1,65 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES 32
+
+/** stats in btr_defragment */
+extern Atomic_counter<ulint> btr_defragment_compression_failures;
+extern Atomic_counter<ulint> btr_defragment_failures;
+extern Atomic_counter<ulint> btr_defragment_count;
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index); /*!< Index to find. */
+/** Defragment an index.
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd);
+/******************************************************************//**
+When table is dropped, this function is called to mark a table as removed in
+btr_efragment_wq. The difference between this function and the remove_index
+function is this will not NULL the event. */
+void
+btr_defragment_remove_table(
+ dict_table_t* table); /*!< Index to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.*/
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index);
+
+/* Stop defragmentation.*/
+void btr_defragment_end();
+extern bool btr_defragment_active;
+#endif
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
new file mode 100644
index 00000000..c66a3bfa
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.h
@@ -0,0 +1,459 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "buf0block_hint.h"
+#include "btr0btr.h"
+#include "gis0rtree.h"
+
+/* Relative positions for a stored cursor position */
+enum btr_pcur_pos_t {
+ BTR_PCUR_ON = 1,
+ BTR_PCUR_BEFORE = 2,
+ BTR_PCUR_AFTER = 3,
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+ BTR_PCUR_BEFORE_FIRST_IN_TREE = 4, /* in an empty tree */
+ BTR_PCUR_AFTER_LAST_IN_TREE = 5 /* in an empty tree */
+};
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+ btr_pcur_t* cursor);/*!< in, out: persistent cursor */
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is
+ copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+
+/** Opens an persistent cursor to an index tree without initializing the
+cursor.
+@param tuple tuple on which search done
+@param mode PAGE_CUR_L, ...; NOTE that if the search is made using a
+ unique prefix of a record, mode should be PAGE_CUR_LE, not
+ PAGE_CUR_GE, as the latter may end up on the previous page of
+ the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor memory buffer for persistent cursor
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *cursor, mtr_t *mtr);
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor id before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr to commit */
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursor before
+calling this, if restoration of cursor is wanted later.
+@param[in] pcur persistent cursor
+@param[in] sec_pcur secondary index persistent cursor
+@param[in] mtr mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+ btr_pcur_t* pcur,
+ btr_pcur_t* sec_pcur,
+ mtr_t* mtr);
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return true if the cursor was not before first in tree */
+bool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and bufferunfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+dberr_t
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+#define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#define btr_pcur_get_page(cursor) btr_pcur_get_block(cursor)->page.frame
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor);
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor);
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+ btr_pcur_t* cursor); /*!< in/out: persistent cursor */
+
+/** Position state of persistent B-tree cursor. */
+enum pcur_pos_t {
+ /** The persistent cursor is not positioned. */
+ BTR_PCUR_NOT_POSITIONED = 0,
+ /** The persistent cursor was previously positioned.
+ TODO: currently, the state can be BTR_PCUR_IS_POSITIONED,
+ though it really should be BTR_PCUR_WAS_POSITIONED,
+ because we have no obligation to commit the cursor with
+ mtr; similarly latch_mode may be out of date. This can
+ lead to problems if btr_pcur is not used the right way;
+ all current code should be ok. */
+ BTR_PCUR_WAS_POSITIONED,
+ /** The persistent cursor is positioned by optimistic get to the same
+ record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON.
+ It may need adjustment depending on previous/current search direction
+ and rel_pos. */
+ BTR_PCUR_IS_POSITIONED_OPTIMISTIC,
+ /** The persistent cursor is positioned by index search.
+ Or optimistic get for rel_pos == BTR_PCUR_ON. */
+ BTR_PCUR_IS_POSITIONED
+};
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_t
+{
+ /** Return value of restore_position() */
+ enum restore_status {
+ /** cursor position on user rec and points on the record with
+ the same field values as in the stored record */
+ SAME_ALL,
+ /** cursor position is on user rec and points on the record with
+ the same unique field values as in the stored record */
+ SAME_UNIQ,
+ /** cursor position is not on user rec or points on the record
+ with not the same uniq field values as in the stored record */
+ NOT_SAME,
+ /** the index tree is corrupted */
+ CORRUPTED
+ };
+ /** a B-tree cursor */
+ btr_cur_t btr_cur;
+ /** @see BTR_PCUR_WAS_POSITIONED
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
+ depending on the latching state of the page and tree where the cursor
+ is positioned; BTR_NO_LATCHES means that the cursor is not currently
+ positioned:
+ we say then that the cursor is detached; it can be restored to
+ attached if the old position was stored in old_rec */
+ btr_latch_mode latch_mode= BTR_NO_LATCHES;
+ /** if cursor position is stored, contains an initial segment of the
+ latest record cursor was positioned either on, before or after */
+ rec_t *old_rec= nullptr;
+ /** btr_cur.index()->n_core_fields when old_rec was copied */
+ uint16 old_n_core_fields= 0;
+ /** number of fields in old_rec */
+ uint16 old_n_fields= 0;
+ /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
+ whether cursor was on, before, or after the old_rec record */
+ btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0);
+ /** buffer block when the position was stored */
+ buf::Block_hint block_when_stored;
+ /** the modify clock value of the buffer block when the cursor position
+ was stored */
+ ib_uint64_t modify_clock= 0;
+ /** btr_pcur_store_position() and restore_position() state. */
+ enum pcur_pos_t pos_state= BTR_PCUR_NOT_POSITIONED;
+ page_cur_mode_t search_mode= PAGE_CUR_UNSUPP;
+ /** the transaction, if we know it; otherwise this field is not defined;
+ can ONLY BE USED in error prints in fatal assertion failures! */
+ trx_t *trx_if_known= nullptr;
+ /** a dynamically allocated buffer for old_rec */
+ byte *old_rec_buf= nullptr;
+ /** old_rec_buf size if old_rec_buf is not NULL */
+ ulint buf_size= 0;
+
+ /** Return the index of this persistent cursor */
+ dict_index_t *index() const { return(btr_cur.index()); }
+ MY_ATTRIBUTE((nonnull, warn_unused_result))
+ /** Restores the stored position of a persistent cursor bufferfixing
+ the page and obtaining the specified latches. If the cursor position
+ was saved when the
+ (1) cursor was positioned on a user record: this function restores the
+ position to the last record LESS OR EQUAL to the stored record;
+ (2) cursor was positioned on a page infimum record: restores the
+ position to the last record LESS than the user record which was the
+ successor of the page infimum;
+ (3) cursor was positioned on the page supremum: restores to the first
+ record GREATER than the user record which was the predecessor of the
+ supremum.
+ (4) cursor was positioned before the first or after the last in an
+ empty tree: restores to before first or after the last in the tree.
+ @param latch_mode BTR_SEARCH_LEAF, ...
+ @param mtr mini-transaction
+ @retval SAME_ALL cursor position on user rec and points on
+ the record with the same field values as in the stored record,
+ @retval SAME_UNIQ cursor position is on user rec and points on the
+ record with the same unique field values as in the stored record,
+ @retval NOT_SAME cursor position is not on user rec or points on
+ the record with not the same uniq field values as in the stored
+ @retval CORRUPTED if the index is corrupted */
+ restore_status restore_position(btr_latch_mode latch_mode, mtr_t *mtr);
+
+ /** Open the cursor on the first or last record.
+ @param first true=first record, false=last record
+ @param index B-tree
+ @param latch_mode which latches to acquire
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+ mtr_t *mtr)
+
+ {
+ this->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ search_mode= first ? PAGE_CUR_G : PAGE_CUR_L;
+ pos_state= BTR_PCUR_IS_POSITIONED;
+ old_rec= nullptr;
+
+ return btr_cur.open_leaf(first, index, this->latch_mode, mtr);
+ }
+};
+
+inline buf_block_t *btr_pcur_get_block(btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ return cursor->btr_cur.page_cur.block;
+}
+
+inline const buf_block_t *btr_pcur_get_block(const btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ return cursor->btr_cur.page_cur.block;
+}
+
+inline rec_t *btr_pcur_get_rec(const btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ return cursor->btr_cur.page_cur.rec;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. */
+inline
+dberr_t
+btr_pcur_open(
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_LE, ... */
+ btr_latch_mode latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ cursor->search_mode= mode;
+ cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+ cursor->trx_if_known= nullptr;
+ return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/** Open a cursor on the first user record satisfying the search condition;
+in case of no match, after the last index record. */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline
+dberr_t
+btr_pcur_open_on_user_rec(
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ if (dberr_t err=
+ btr_pcur_open(tuple, PAGE_CUR_GE, latch_mode, cursor, mtr))
+ return err;
+ if (!btr_pcur_is_after_last_on_page(cursor) ||
+ btr_pcur_is_after_last_in_tree(cursor))
+ return DB_SUCCESS;
+ if (dberr_t err= btr_pcur_move_to_next_page(cursor, mtr))
+ return err;
+ return btr_pcur_move_to_next_on_page(cursor) ? DB_SUCCESS : DB_CORRUPTION;
+}
+
+#include "btr0pcur.inl"
diff --git a/storage/innobase/include/btr0pcur.inl b/storage/innobase/include/btr0pcur.inl
new file mode 100644
index 00000000..b827d70d
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.inl
@@ -0,0 +1,372 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.ic
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor);
+ ut_ad(cursor->old_rec);
+ ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+ || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(cursor->rel_pos);
+}
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+ ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ return !btr_pcur_is_before_first_on_page(cursor) &&
+ !btr_pcur_is_after_last_on_page(cursor);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return !page_has_prev(btr_pcur_get_page(cursor))
+ && page_cur_is_before_first(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return !page_has_next(btr_pcur_get_page(cursor))
+ && page_cur_is_after_last(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_rec = nullptr;
+ return page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_rec = nullptr;
+
+ return page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_rec = nullptr;
+loop:
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+ if (btr_pcur_is_after_last_in_tree(cursor)
+ || btr_pcur_move_to_next_page(cursor, mtr) != DB_SUCCESS) {
+ return(FALSE);
+ }
+ } else if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(cursor))) {
+ return false;
+ }
+
+ if (btr_pcur_is_on_user_rec(cursor)) {
+
+ return(TRUE);
+ }
+
+ goto loop;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_rec= nullptr;
+
+ if (btr_pcur_is_after_last_on_page(cursor))
+ return !btr_pcur_is_after_last_in_tree(cursor) &&
+ btr_pcur_move_to_next_page(cursor, mtr) == DB_SUCCESS;
+ else
+ return !!btr_pcur_move_to_next_on_page(cursor);
+}
+
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr to commit */
+{
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursor before
+calling this, if restoration of cursor is wanted later.
+@param[in] pcur persistent cursor
+@param[in] sec_pcur secondary index persistent cursor
+@param[in] mtr mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+ btr_pcur_t* pcur,
+ btr_pcur_t* sec_pcur,
+ mtr_t* mtr)
+{
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(sec_pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+ sec_pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+ sec_pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ pcur->old_rec_buf = NULL;
+ pcur->old_rec = NULL;
+
+ pcur->btr_cur.rtr_info = NULL;
+}
+
+/** Opens an persistent cursor to an index tree without initializing the
+cursor.
+@param tuple tuple on which search done
+@param mode search mode; NOTE that if the search is made using a
+ unique prefix of a record, mode should be PAGE_CUR_LE, not
+ PAGE_CUR_GE, as the latter may end up on the previous page of
+ the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor memory buffer for persistent cursor
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *cursor, mtr_t *mtr)
+{
+ cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+ cursor->search_mode= mode;
+ cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+ cursor->trx_if_known= nullptr;
+ return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_free(cursor->old_rec_buf);
+
+ if (cursor->btr_cur.rtr_info)
+ rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
+
+ cursor->btr_cur.rtr_info= nullptr;
+ cursor->old_rec = nullptr;
+ cursor->old_rec_buf = nullptr;
+ cursor->btr_cur.page_cur.rec = nullptr;
+ cursor->btr_cur.page_cur.block = nullptr;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+ cursor->trx_if_known = nullptr;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_set_before_first(btr_pcur_get_block(cursor),
+ btr_pcur_get_page_cur(cursor));
+
+ cursor->old_rec = nullptr;
+}
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
new file mode 100644
index 00000000..b75cad10
--- /dev/null
+++ b/storage/innobase/include/btr0sea.h
@@ -0,0 +1,403 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.h
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "dict0dict.h"
+#ifdef BTR_CUR_HASH_ADAPT
+#include "ha0ha.h"
+#include "srw_lock.h"
+
+#ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#define btr_search_sys_create() btr_search_sys.create()
+#define btr_search_sys_free() btr_search_sys.free()
+
+/** Disable the adaptive hash search system and empty the index. */
+void btr_search_disable();
+
+/** Enable the adaptive hash search system.
+@param resize whether buf_pool_t::resize() is the caller */
+void btr_search_enable(bool resize= false);
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+
+/** Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@param[in,out] index index
+@param[in,out] info index search info
+@param[in] tuple logical record
+@param[in] mode PAGE_CUR_L, ....
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
+@param[out] cursor tree cursor
+@param[in] mtr mini-transaction
+@return whether the search succeeded */
+bool
+btr_search_guess_on_hash(
+ dict_index_t* index,
+ btr_search_t* info,
+ const dtuple_t* tuple,
+ ulint mode,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ mtr_t* mtr);
+
+/** Move or delete hash entries for moved records, usually in a page split.
+If new_block is already hashed, then any hash index for block is dropped.
+If new_block is not hashed, and block is hashed, then a new hash index is
+built to new_block with the same parameters as block.
+@param[in,out] new_block destination page
+@param[in,out] block source page (subject to deletion later) */
+void
+btr_search_move_or_delete_hash_entries(
+ buf_block_t* new_block,
+ buf_block_t* block);
+
+/** Drop any adaptive hash index entries that point to an index page.
+@param[in,out] block block containing index page, s- or x-latched, or an
+ index page for which we know that
+ block->buf_fix_count == 0 or it is an index page which
+ has already been removed from the buf_pool.page_hash
+ i.e.: it is in state BUF_BLOCK_REMOVE_HASH
+@param[in] garbage_collect drop ahi only if the index is marked
+ as freed */
+void btr_search_drop_page_hash_index(buf_block_t* block,
+ bool garbage_collect);
+
+/** Drop possible adaptive hash index entries when a page is evicted
+from the buffer pool or freed in a file, or the index is being dropped.
+@param[in] page_id page id */
+void btr_search_drop_page_hash_when_freed(const page_id_t page_id);
+
+/** Updates the page hash index when a single record is inserted on a page.
+@param[in] cursor cursor which was positioned to the place to insert
+ using btr_cur_search_, and the new record has been
+ inserted next to the cursor.
+@param[in] ahi_latch the adaptive hash index latch */
+void btr_search_update_hash_node_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch);
+
+/** Updates the page hash index when a single record is inserted on a page.
+@param[in,out] cursor cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor
+@param[in] ahi_latch the adaptive hash index latch */
+void btr_search_update_hash_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch);
+
+/** Updates the page hash index when a single record is deleted from a page.
+@param[in] cursor cursor which was positioned on the record to delete
+ using btr_cur_search_, the record is not yet deleted.*/
+void btr_search_update_hash_on_delete(btr_cur_t *cursor);
+
+/** Validates the search system.
+@param thd connection, for checking if CHECK TABLE has been killed
+@return true if ok */
+bool btr_search_validate(THD *thd);
+
+/** Lock all search latches in exclusive mode. */
+static inline void btr_search_x_lock_all();
+
+/** Unlock all search latches from exclusive mode. */
+static inline void btr_search_x_unlock_all();
+
+/** Lock all search latches in shared mode. */
+static inline void btr_search_s_lock_all();
+
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all();
+
+# ifdef UNIV_DEBUG
+/** @return if the index is marked as freed */
+bool btr_search_check_marked_free_index(const buf_block_t *block);
+# endif /* UNIV_DEBUG */
+#else /* BTR_CUR_HASH_ADAPT */
+# define btr_search_sys_create()
+# define btr_search_sys_free()
+# define btr_search_drop_page_hash_index(block, garbage_collect)
+# define btr_search_s_lock_all(index)
+# define btr_search_s_unlock_all(index)
+# define btr_search_info_update(index, cursor)
+# define btr_search_move_or_delete_hash_entries(new_block, block)
+# define btr_search_update_hash_on_insert(cursor, ahi_latch)
+# define btr_search_update_hash_on_delete(cursor)
+# ifdef UNIV_DEBUG
+# define btr_search_check_marked_free_index(block)
+# endif /* UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef BTR_CUR_ADAPT
+/** Create and initialize search info.
+@param[in,out] heap heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** @return the search info of an index */
+static inline btr_search_t* btr_search_get_info(dict_index_t* index)
+{
+ return(index->search_info);
+}
+#endif /* BTR_CUR_ADAPT */
+
+/** The search info struct in an index */
+struct btr_search_t{
+ /* @{ The following fields are not protected by any latch.
+ Unfortunately, this means that they must be aligned to
+ the machine word, i.e., they cannot be turned into bit-fields. */
+ buf_block_t* root_guess;/*!< the root page frame when it was last time
+ fetched, or NULL */
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint hash_analysis; /*!< when this exceeds
+ BTR_SEARCH_HASH_ANALYSIS, the hash
+ analysis starts; this is reset if no
+ success noticed */
+ ibool last_hash_succ; /*!< TRUE if the last search would have
+ succeeded, or did succeed, using the hash
+ index; NOTE that the value here is not exact:
+ it is not calculated for every search, and the
+ calculation itself is not always accurate! */
+ ulint n_hash_potential;
+ /*!< number of consecutive searches
+ which would have succeeded, or did succeed,
+ using the hash index;
+ the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+ /* @} */
+ ulint ref_count; /*!< Number of blocks in this index tree
+ that have search index built
+ i.e. block->index points to this index.
+ Protected by search latch except
+ when during initialization in
+ btr_search_info_create(). */
+
+ /*---------------------- @{ */
+ uint16_t n_fields; /*!< recommended prefix length for hash search:
+ number of full fields */
+ uint16_t n_bytes; /*!< recommended prefix: number of bytes in
+ an incomplete field
+ @see BTR_PAGE_MAX_REC_SIZE */
+ bool left_side; /*!< true or false, depending on whether
+ the leftmost record of several records with
+ the same prefix should be indexed in the
+ hash index */
+ /*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+ ulint n_hash_succ; /*!< number of successful hash searches thus
+ far */
+ ulint n_hash_fail; /*!< number of failed hash searches */
+ ulint n_patt_succ; /*!< number of successful pattern searches thus
+ far */
+ ulint n_searches; /*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#endif /* BTR_CUR_HASH_ADAPT */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_t::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N 1112765
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** The hash index system */
+struct btr_search_sys_t
+{
+ /** Partition of the hash table */
+ struct partition
+ {
+ /** latches protecting hash_table */
+ srw_spin_lock latch;
+ /** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
+ hash_table_t table;
+ /** memory heap for table */
+ mem_heap_t *heap;
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// nonstandard extension - zero sized array, if perfschema is not compiled
+#pragma warning(disable : 4200)
+#endif
+
+ char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof latch -
+ sizeof table - sizeof heap) &
+ (CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+ void init()
+ {
+ memset((void*) this, 0, sizeof *this);
+ latch.SRW_LOCK_INIT(btr_search_latch_key);
+ }
+
+ void alloc(ulint hash_size)
+ {
+ table.create(hash_size);
+ heap= mem_heap_create_typed(std::min<ulong>(4096,
+ MEM_MAX_ALLOC_IN_BUF / 2
+ - MEM_BLOCK_HEADER_SIZE
+ - MEM_SPACE_NEEDED(0)),
+ MEM_HEAP_FOR_BTR_SEARCH);
+ }
+
+ void clear()
+ {
+ mem_heap_free(heap);
+ heap= nullptr;
+ ut_free(table.array);
+ }
+
+ void free()
+ {
+ latch.destroy();
+ if (heap)
+ clear();
+ }
+ };
+
+ /** Partitions of the adaptive hash index */
+ partition *parts;
+
+ /** Get an adaptive hash index partition */
+ partition *get_part(index_id_t id, ulint space_id) const
+ {
+ return parts + ut_fold_ulint_pair(ulint(id), space_id) % btr_ahi_parts;
+ }
+
+ /** Get an adaptive hash index partition */
+ partition *get_part(const dict_index_t &index) const
+ {
+ ut_ad(!index.table->space ||
+ index.table->space->id == index.table->space_id);
+ return get_part(ulint(index.id), index.table->space_id);
+ }
+
+ /** Get the search latch for the adaptive hash index partition */
+ srw_spin_lock *get_latch(const dict_index_t &index) const
+ { return &get_part(index)->latch; }
+
+ /** Create and initialize at startup */
+ void create()
+ {
+ parts= static_cast<partition*>(ut_malloc(btr_ahi_parts * sizeof *parts,
+ mem_key_ahi));
+ for (ulong i= 0; i < btr_ahi_parts; ++i)
+ parts[i].init();
+ if (btr_search_enabled)
+ btr_search_enable();
+ }
+
+ void alloc(ulint hash_size)
+ {
+ hash_size/= btr_ahi_parts;
+ for (ulong i= 0; i < btr_ahi_parts; ++i)
+ parts[i].alloc(hash_size);
+ }
+
+ /** Clear when disabling the adaptive hash index */
+ void clear() { for (ulong i= 0; i < btr_ahi_parts; ++i) parts[i].clear(); }
+
+ /** Free at shutdown */
+ void free()
+ {
+ if (parts)
+ {
+ for (ulong i= 0; i < btr_ahi_parts; ++i)
+ parts[i].free();
+ ut_free(parts);
+ parts= nullptr;
+ }
+ }
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t btr_search_sys;
+
+/** @return number of leaf pages pointed to by the adaptive hash index */
+TRANSACTIONAL_INLINE inline ulint dict_index_t::n_ahi_pages() const
+{
+ if (!btr_search_enabled)
+ return 0;
+ srw_spin_lock *latch= &btr_search_sys.get_part(*this)->latch;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (latch->is_locked())
+ xabort();
+ ulint ref_count= search_info->ref_count;
+ xend();
+ return ref_count;
+ }
+#endif
+ latch->rd_lock(SRW_LOCK_CALL);
+ ulint ref_count= search_info->ref_count;
+ latch->rd_unlock();
+ return ref_count;
+}
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS 17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT 3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT 3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+again set this much timeout. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT 10000
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#include "btr0sea.inl"
+
+#endif
diff --git a/storage/innobase/include/btr0sea.inl b/storage/innobase/include/btr0sea.inl
new file mode 100644
index 00000000..5a8d6480
--- /dev/null
+++ b/storage/innobase/include/btr0sea.inl
@@ -0,0 +1,117 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/** Create and initialize search info.
+@param[in,out] heap heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+{
+ btr_search_t* info = static_cast<btr_search_t*>(
+ mem_heap_zalloc(heap, sizeof(btr_search_t)));
+ ut_d(info->magic_n = BTR_SEARCH_MAGIC_N);
+#ifdef BTR_CUR_HASH_ADAPT
+ info->n_fields = 1;
+ info->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+ return(info);
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Updates the search info.
+@param[in,out] info search info
+@param[in,out] cursor cursor which was just positioned */
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor);
+
+/*********************************************************************//**
+Updates the search info. */
+static inline
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ ut_ad(!index->is_spatial());
+ ut_ad(!index->table->is_temporary());
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ btr_search_t* info;
+ info = btr_search_get_info(index);
+
+ info->hash_analysis++;
+
+ if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+ /* Do nothing */
+
+ return;
+
+ }
+
+ ut_ad(cursor->flag != BTR_CUR_HASH);
+
+ btr_search_info_update_slow(info, cursor);
+}
+
+/** Lock all search latches in exclusive mode. */
+static inline void btr_search_x_lock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ btr_search_sys.parts[i].latch.wr_lock(SRW_LOCK_CALL);
+ }
+}
+
+/** Unlock all search latches from exclusive mode. */
+static inline void btr_search_x_unlock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ btr_search_sys.parts[i].latch.wr_unlock();
+ }
+}
+
+/** Lock all search latches in shared mode. */
+static inline void btr_search_s_lock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ btr_search_sys.parts[i].latch.rd_lock(SRW_LOCK_CALL);
+ }
+}
+
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ btr_search_sys.parts[i].latch.rd_unlock();
+ }
+}
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
new file mode 100644
index 00000000..fc829e78
--- /dev/null
+++ b/storage/innobase/include/btr0types.h
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#pragma once
+
+#include "page0types.h"
+#include "rem0types.h"
+
+/** Persistent cursor */
+struct btr_pcur_t;
+/** B-tree cursor */
+struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+struct btr_search_t;
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Is search system enabled.
+Search system is protected by array of latches. */
+extern char btr_search_enabled;
+
+/** Number of adaptive hash index partition. */
+extern ulong btr_ahi_parts;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define FIELD_REF_SIZE 20U
+#define BTR_EXTERN_FIELD_REF_SIZE FIELD_REF_SIZE
+
+/** If the data don't exceed the size, the data are stored locally. */
+#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE \
+ (BTR_EXTERN_FIELD_REF_SIZE * 2)
+
+/** Latching modes for btr_cur_t::search_leaf(). */
+enum btr_latch_mode {
+ /** Search a record on a leaf page and S-latch it. */
+ BTR_SEARCH_LEAF = RW_S_LATCH,
+ /** (Prepare to) modify a record on a leaf page and X-latch it. */
+ BTR_MODIFY_LEAF = RW_X_LATCH,
+ /** U-latch root and X-latch a leaf page */
+ BTR_MODIFY_ROOT_AND_LEAF = RW_SX_LATCH,
+ /** Obtain no latches. */
+ BTR_NO_LATCHES = RW_NO_LATCH,
+ /** Search the previous record.
+ Used in btr_pcur_move_backward_from_page(). */
+ BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF,
+ /** Modify the previous record.
+ Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */
+ BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF,
+ /** Start modifying the entire B-tree. */
+ BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF,
+ /** Continue modifying the entire R-tree.
+ Only used by rtr_search_to_nth_level(). */
+ BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE,
+
+ /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
+ exclusive. */
+ /** The search tuple will be inserted to the secondary index
+ at the searched position. When the leaf page is not in the
+ buffer pool, try to use the change buffer. */
+ BTR_INSERT = 64,
+
+ /** Try to delete mark a secondary index leaf page record at
+ the searched position using the change buffer when the page is
+ not in the buffer pool. */
+ BTR_DELETE_MARK = 128,
+
+ /** Try to purge the record using the change buffer when the
+ secondary index leaf page is not in the buffer pool. */
+ BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK,
+
+ /** The caller is already holding dict_index_t::lock S-latch. */
+ BTR_ALREADY_S_LATCHED = 256,
+ /** Search and S-latch a leaf page, assuming that the
+ dict_index_t::lock S-latch is being held. */
+ BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Search and X-latch a leaf page, assuming that the
+ dict_index_t::lock is being held in non-exclusive mode. */
+ BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Attempt to modify records in an x-latched tree. */
+ BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE
+ | BTR_ALREADY_S_LATCHED,
+ /** U-latch root and X-latch a leaf page, assuming that
+ dict_index_t::lock is being held in U mode. */
+ BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+ /** Attempt to delete-mark a secondary index record. */
+ BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
+ /** Attempt to delete-mark a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Attempt to purge a secondary index record. */
+ BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
+ /** Attempt to purge a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+ /** In the case of BTR_MODIFY_TREE, the caller specifies
+ the intention to delete record only. It is used to optimize
+ block->lock range.*/
+ BTR_LATCH_FOR_DELETE = 512,
+
+ /** In the case of BTR_MODIFY_TREE, the caller specifies
+ the intention to delete record only. It is used to optimize
+ block->lock range.*/
+ BTR_LATCH_FOR_INSERT = 1024,
+
+ /** Attempt to delete a record in the tree. */
+ BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ /** Attempt to delete a record in an x-latched tree. */
+ BTR_PURGE_TREE_ALREADY_LATCHED = BTR_PURGE_TREE
+ | BTR_ALREADY_S_LATCHED,
+
+ /** Attempt to insert a record into the tree. */
+ BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+
+ /** This flag ORed to BTR_INSERT says that we can ignore possible
+ UNIQUE definition on secondary indexes when we decide if we can use
+ the insert buffer to speed up inserts */
+ BTR_IGNORE_SEC_UNIQUE = 2048,
+ /** Rollback in spatial index */
+ BTR_RTREE_UNDO_INS = 4096,
+ /** Try to delete mark a spatial index record */
+ BTR_RTREE_DELETE_MARK = 8192
+};
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
new file mode 100644
index 00000000..d4fee7c1
--- /dev/null
+++ b/storage/innobase/include/buf0block_hint.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+#pragma once
+#include "buf0buf.h"
+
+namespace buf {
+class Block_hint {
+public:
+ /** Stores the pointer to the block, which is currently buffer-fixed.
+ @param block a pointer to a buffer-fixed block to be stored */
+ inline void store(buf_block_t *block)
+ {
+ ut_ad(block->page.buf_fix_count());
+ m_block= block;
+ m_page_id= block->page.id();
+ }
+
+ /** Clears currently stored pointer. */
+ inline void clear() { m_block= nullptr; }
+
+ /** Invoke f on m_block(which may be null)
+ @param f The function to be executed. It will be passed the pointer.
+ If you wish to use the block pointer subsequently,
+ you need to ensure you buffer-fix it before returning from f.
+ @return the return value of f
+ */
+ template <typename F>
+ bool run_with_hint(const F &f)
+ {
+ buffer_fix_block_if_still_valid();
+ /* m_block could be changed during f() call, so we use local
+ variable to remember which block we need to unfix */
+ buf_block_t *block= m_block;
+ bool res= f(block);
+ if (block)
+ block->page.unfix();
+ return res;
+ }
+
+ buf_block_t *block() const { return m_block; }
+
+ private:
+ /** The block pointer stored by store(). */
+ buf_block_t *m_block= nullptr;
+ /** If m_block is non-null, the m_block->page.id at time it was stored. */
+ page_id_t m_page_id{0, 0};
+
+ /** A helper function which checks if m_block is not a dangling pointer and
+ still points to block with page with m_page_id and if so, buffer-fixes it,
+ otherwise clear()s it */
+ void buffer_fix_block_if_still_valid();
+};
+} // namespace buf
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
new file mode 100644
index 00000000..bb999420
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#include "buf0types.h"
+
+/**
+@param[in] block size in bytes
+@return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+inline
+ulint
+buf_buddy_get_slot(ulint size)
+{
+ ulint i;
+ ulint s;
+
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= srv_page_size);
+
+ for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+ }
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ return i;
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc));
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param size compressed page size in bytes
+@param lru assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr)
+{
+ return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru);
+}
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i);
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] size block size in bytes */
+inline void buf_buddy_free(void* buf, ulint size)
+{
+ buf_buddy_free_low(buf, buf_buddy_get_slot(size));
+}
+
+/** Try to reallocate a block.
+@param[in] buf block to be reallocated, must be pointed
+to by the buffer pool
+@param[in] size block size, up to srv_page_size
+@retval false if failed because of no free blocks. */
+bool buf_buddy_realloc(void* buf, ulint size);
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free();
+#endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
new file mode 100644
index 00000000..332b2039
--- /dev/null
+++ b/storage/innobase/include/buf0buf.h
@@ -0,0 +1,2190 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "span.h"
+#include "assume_aligned.h"
+#include "buf0types.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "ut0byte.h"
+#include "page0types.h"
+#include "log0log.h"
+#include "srv0srv.h"
+#include "transactional_lock_guard.h"
+#include <ostream>
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET 10 /*!< get always */
+#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
+ the block young in the LRU list */
+#define BUF_GET_IF_IN_POOL_OR_WATCH 15
+ /*!< Get the page only if it's in the
+ buffer pool, if not then set a watch
+ on the page. */
+#define BUF_GET_POSSIBLY_FREED 16
+ /*!< Like BUF_GET, but do not mind
+ if the file page has been freed. */
+/* @} */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN 256
+
+/** This structure defines information we will fetch from each buffer pool. It
+will be used to print table IO stats */
+struct buf_pool_info_t
+{
+ /* General buffer pool info */
+ ulint pool_size; /*!< Buffer Pool size in pages */
+ ulint lru_len; /*!< Length of buf_pool.LRU */
+ ulint old_lru_len; /*!< buf_pool.LRU_old_len */
+ ulint free_list_len; /*!< Length of buf_pool.free list */
+ ulint flush_list_len; /*!< Length of buf_pool.flush_list */
+ ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages
+ pending decompress */
+ ulint n_pend_reads; /*!< os_aio_pending_reads() */
+ ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */
+ ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH
+ LIST */
+ ulint n_pages_made_young; /*!< number of pages made young */
+ ulint n_pages_not_made_young; /*!< number of pages not made young */
+ ulint n_pages_read; /*!< buf_pool.n_pages_read */
+ ulint n_pages_created; /*!< buf_pool.n_pages_created */
+ ulint n_pages_written; /*!< buf_pool.n_pages_written */
+ ulint n_page_gets; /*!< buf_pool.n_page_gets */
+ ulint n_ra_pages_read_rnd; /*!< buf_pool.n_ra_pages_read_rnd,
+ number of pages readahead */
+ ulint n_ra_pages_read; /*!< buf_pool.n_ra_pages_read, number
+ of pages readahead */
+ ulint n_ra_pages_evicted; /*!< buf_pool.n_ra_pages_evicted,
+ number of readahead pages evicted
+ without access */
+ ulint n_page_get_delta; /*!< num of buffer pool page gets since
+ last printout */
+
+ /* Buffer pool access stats */
+ double page_made_young_rate; /*!< page made young rate in pages
+ per second */
+ double page_not_made_young_rate;/*!< page not made young rate
+ in pages per second */
+ double pages_read_rate; /*!< num of pages read per second */
+ double pages_created_rate; /*!< num of pages create per second */
+ double pages_written_rate; /*!< num of pages written per second */
+ ulint page_read_delta; /*!< num of pages read since last
+ printout */
+ ulint young_making_delta; /*!< num of pages made young since
+ last printout */
+ ulint not_young_making_delta; /*!< num of pages not make young since
+ last printout */
+
+ /* Statistics about read ahead algorithm. */
+ double pages_readahead_rnd_rate;/*!< random readahead rate in pages per
+ second */
+ double pages_readahead_rate; /*!< readahead rate in pages per
+ second */
+ double pages_evicted_rate; /*!< rate of readahead page evicted
+ without access, in pages per second */
+
+ /* Stats about LRU eviction */
+ ulint unzip_lru_len; /*!< length of buf_pool.unzip_LRU
+ list */
+ /* Counters for LRU policy */
+ ulint io_sum; /*!< buf_LRU_stat_sum.io */
+ ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO
+ for current interval */
+ ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */
+ ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num
+ pages decompressed in current
+ interval */
+};
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Print the given page_id_t object.
+@param[in,out] out the output stream
+@param[in] page_id the page_id_t object to be printed
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& out,
+ const page_id_t page_id);
+
+#ifndef UNIV_INNOCHECKSUM
+# define buf_pool_get_curr_size() srv_buf_pool_curr_size
+
+/** Allocate a buffer block.
+@return own: the allocated block, state()==MEMORY */
+inline buf_block_t *buf_block_alloc();
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block); /*!< in, own: block to be freed */
+
+#define buf_page_get(ID, SIZE, LA, MTR) \
+ buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR)
+
+/** Try to acquire a page latch.
+@param rw_latch RW_S_LATCH or RW_X_LATCH
+@param block guessed block
+@param modify_clock expected value of block->modify_clock
+@param mtr mini-transaction
+@return whether the latch was acquired (the page is an allocated file page) */
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+ uint64_t modify_clock, mtr_t *mtr);
+
+/** Try to S-latch a page.
+Suitable for using when holding the lock_sys latches (as it avoids deadlock).
+@param[in] page_id page identifier
+@param[in,out] mtr mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr);
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with unfix().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size in bytes
+@return pointer to the block, s-latched */
+buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size);
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ mtr_t* mtr,
+ dberr_t* err = NULL,
+ bool allow_ibuf_merge = false)
+ MY_ATTRIBUTE((nonnull(6), warn_unused_result));
+
+/** This is the low level function used to get access to a database page.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction, or NULL if a
+ block with page_id is to be evicted
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge to happen
+while reading the page from file
+then it makes sure that it does merging of change buffer changes while
+reading the page from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_low(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge);
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED => LRU
+(the other is buf_page_get_low()).
+@param[in,out] space space object
+@param[in] offset offset of the tablespace
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[in,out] free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
+
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id space identfier
+@param zip_size ROW_FORMAT=COMPRESSED page size or 0
+@param mtr mini-transaction
+@param free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr,
+ buf_block_t *free_block);
+
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage);
+/** Mark the page status as FREED for the given tablespace and page number.
+@param[in,out] space tablespace
+@param[in] page page number
+@param[in,out] mtr mini-transaction */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr);
+
+/** Determine if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@param[in] bpage buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage);
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in] bpage buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);
+
+/** Move a page to the start of the buffer pool LRU list if it is too old.
+@param[in,out] bpage buffer pool page */
+inline void buf_page_make_young_if_needed(buf_page_t *bpage)
+{
+ if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) {
+ buf_page_make_young(bpage);
+ }
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Check if a buffer is all zeroes.
+@param[in] buf data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(st_::span<const byte> buf);
+
+/** Check if a page is corrupt.
+@param check_lsn whether FIL_PAGE_LSN should be checked
+@param read_buf database page
+@param fsp_flags contents of FIL_SPACE_FLAGS
+@return whether the page is corrupted */
+bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
+ uint32_t fsp_flags)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Read the key version from the page. In full crc32 format,
+key version is stored at {0-3th} bytes. In other format, it is
+stored in 26th position.
+@param[in] read_buf database page
+@param[in] fsp_flags tablespace flags
+@return key version of the page. */
+inline uint32_t buf_page_get_key_version(const byte* read_buf,
+ uint32_t fsp_flags)
+{
+ static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility");
+ return fil_space_t::full_crc32(fsp_flags)
+ ? mach_read_from_4(my_assume_aligned<4>(read_buf))
+ : mach_read_from_4(my_assume_aligned<2>
+ (read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION));
+}
+
+/** Read the compression info from the page. In full crc32 format,
+compression info is at MSB of page type. In other format, it is
+stored in page type.
+@param[in] read_buf database page
+@param[in] fsp_flags tablespace flags
+@return true if page is compressed. */
+inline bool buf_page_is_compressed(const byte* read_buf, uint32_t fsp_flags)
+{
+ uint16_t page_type= fil_page_get_type(read_buf);
+ return fil_space_t::full_crc32(fsp_flags)
+ ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)
+ : page_type == FIL_PAGE_PAGE_COMPRESSED;
+}
+
+/** Get the compressed or uncompressed size of a full_crc32 page.
+@param[in] buf page_compressed or uncompressed page
+@param[out] comp whether the page could be compressed
+@param[out] cr whether the page could be corrupted
+@return the payload size in the file page */
+inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr)
+{
+ uint t = fil_page_get_type(buf);
+ uint page_size = uint(srv_page_size);
+
+ if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) {
+ return page_size;
+ }
+
+ t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER);
+ t <<= 8;
+
+ if (t < page_size) {
+ page_size = t;
+ if (comp) {
+ *comp = true;
+ }
+ } else if (cr) {
+ *cr = true;
+ }
+
+ return page_size;
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Dump a page to stderr.
+@param[in] read_buf database page
+@param[in] zip_size compressed page size, or 0 */
+void buf_page_print(const byte* read_buf, ulint zip_size = 0)
+ ATTRIBUTE_COLD __attribute__((nonnull));
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check); /*!< in: TRUE=verify the page checksum */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/** Collect buffer pool metadata.
+@param[out] pool_info buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info);
+
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats();
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate();
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#define buf_block_get_frame(block) (block)->page.frame
+
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+ (UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+#define is_buf_block_get_page_zip(block) \
+ UNIV_LIKELY_NULL((block)->page.zip.data)
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage buffer page whose read or write was completed
+@param read true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read);
+
+/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
+if needed.
+@param[in] size size in bytes
+@return aligned size */
+ulint
+buf_pool_size_align(
+ ulint size);
+
+/** Verify that post encryption checksum match with the calculated checksum.
+This function should be called only if tablespace contains crypt data metadata.
+@param page page frame
+@param fsp_flags contents of FSP_SPACE_FLAGS
+@return whether the page is encrypted and valid */
+bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags);
+
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out] page page to update
+@param[in] size compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size);
+
+/** @brief The temporary memory structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+class buf_tmp_buffer_t
+{
+ /** whether this slot is reserved */
+ std::atomic<bool> reserved;
+public:
+ /** For encryption, the data needs to be copied to a separate buffer
+ before it's encrypted&written. The buffer block itself can be replaced
+ while a write of crypt_buf to file is in progress. */
+ byte *crypt_buf;
+ /** buffer for fil_page_compress(), for flushing page_compressed pages */
+ byte *comp_buf;
+ /** pointer to resulting buffer after encryption or compression;
+ not separately allocated memory */
+ byte *out_buf;
+
+ /** Release the slot */
+ void release() { reserved.store(false, std::memory_order_relaxed); }
+
+ /** Acquire the slot
+ @return whether the slot was acquired */
+ bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed);}
+
+ /** Allocate a buffer for encryption, decryption or decompression. */
+ void allocate()
+ {
+ if (!crypt_buf)
+ crypt_buf= static_cast<byte*>
+ (aligned_malloc(srv_page_size, srv_page_size));
+ }
+};
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+class buf_pool_t;
+
+class buf_page_t
+{
+ friend buf_pool_t;
+ friend buf_block_t;
+
+ /** @name General fields */
+ /* @{ */
+
+public: // FIXME: fix fil_iterate()
+ /** Page id. Protected by buf_pool.page_hash.lock_get() when
+ the page is in buf_pool.page_hash. */
+ page_id_t id_;
+ /** buf_pool.page_hash link; protected by buf_pool.page_hash.lock_get() */
+ buf_page_t *hash;
+private:
+ /** log sequence number of the START of the log entry written of the
+ oldest modification to this block which has not yet been written
+ to the data file;
+
+ 0 if no modifications are pending;
+ 1 if no modifications are pending, but the block is in buf_pool.flush_list;
+ 2 if modifications are pending, but the block is not in buf_pool.flush_list
+ (because id().space() is the temporary tablespace). */
+ Atomic_relaxed<lsn_t> oldest_modification_;
+
+public:
+ /** state() of unused block (in buf_pool.free list) */
+ static constexpr uint32_t NOT_USED= 0;
+ /** state() of block allocated as general-purpose memory */
+ static constexpr uint32_t MEMORY= 1;
+ /** state() of block that is being freed */
+ static constexpr uint32_t REMOVE_HASH= 2;
+ /** smallest state() of a buffer page that is freed in the tablespace */
+ static constexpr uint32_t FREED= 3;
+ /** smallest state() for a block that belongs to buf_pool.LRU */
+ static constexpr uint32_t UNFIXED= 1U << 29;
+ /** smallest state() of a block for which buffered changes may exist */
+ static constexpr uint32_t IBUF_EXIST= 2U << 29;
+ /** smallest state() of a (re)initialized page (no doublewrite needed) */
+ static constexpr uint32_t REINIT= 3U << 29;
+ /** smallest state() for an io-fixed block */
+ static constexpr uint32_t READ_FIX= 4U << 29;
+ /** smallest state() for a write-fixed block */
+ static constexpr uint32_t WRITE_FIX= 5U << 29;
+ /** smallest state() for a write-fixed block with buffered changes */
+ static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29;
+ /** smallest state() for a write-fixed block (no doublewrite was used) */
+ static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29;
+ /** buf_pool.LRU status mask in state() */
+ static constexpr uint32_t LRU_MASK= 7U << 29;
+
+ /** lock covering the contents of frame */
+ block_lock lock;
+ /** pointer to aligned, uncompressed page frame of innodb_page_size */
+ byte *frame;
+ /* @} */
+ /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to)
+ is also protected by buf_pool.mutex;
+ !frame && !zip.data means an active buf_pool.watch */
+ page_zip_des_t zip;
+#ifdef UNIV_DEBUG
+ /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
+ bool in_zip_hash;
+ /** whether this->LRU is in buf_pool.LRU (in_file());
+ protected by buf_pool.mutex */
+ bool in_LRU_list;
+ /** whether this is in buf_pool.page_hash (in_file());
+ protected by buf_pool.mutex */
+ bool in_page_hash;
+ /** whether this->list is in buf_pool.free (state() == NOT_USED);
+ protected by buf_pool.flush_list_mutex */
+ bool in_free_list;
+#endif /* UNIV_DEBUG */
+ /** list member in one of the lists of buf_pool; protected by
+ buf_pool.mutex or buf_pool.flush_list_mutex
+
+ state() == NOT_USED: buf_pool.free or buf_pool.withdraw
+
+ in_file() && oldest_modification():
+ buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
+
+ The contents is undefined if in_file() && !oldest_modification(),
+ or if state() == MEMORY or state() == REMOVE_HASH. */
+ UT_LIST_NODE_T(buf_page_t) list;
+
+ /** @name LRU replacement algorithm fields.
+ Protected by buf_pool.mutex. */
+ /* @{ */
+
+ UT_LIST_NODE_T(buf_page_t) LRU;
+ /*!< node of the LRU list */
+ unsigned old:1; /*!< TRUE if the block is in the old
+ blocks in buf_pool.LRU_old */
+ unsigned freed_page_clock:31;/*!< the value of
+ buf_pool.freed_page_clock
+ when this block was the last
+ time put to the head of the
+ LRU list; a thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ /* @} */
+ Atomic_counter<unsigned> access_time; /*!< time of first access, or
+ 0 if the block was never accessed
+ in the buffer pool.
+
+ For state() == MEMORY
+ blocks, this field can be repurposed
+ for something else.
+
+ When this field counts log records
+ and bytes allocated for recv_sys.pages,
+ the field is protected by
+ recv_sys_t::mutex. */
+ buf_page_t() : id_{0}
+ {
+ static_assert(NOT_USED == 0, "compatibility");
+ memset((void*) this, 0, sizeof *this);
+ }
+
+ buf_page_t(const buf_page_t &b) :
+ id_(b.id_), hash(b.hash),
+ oldest_modification_(b.oldest_modification_),
+ lock() /* not copied */,
+ frame(b.frame), zip(b.zip),
+#ifdef UNIV_DEBUG
+ in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list),
+ in_page_hash(b.in_page_hash), in_free_list(b.in_free_list),
+#endif /* UNIV_DEBUG */
+ list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock),
+ access_time(b.access_time)
+ {
+ lock.init();
+ }
+
+ /** Initialize some more fields */
+ void init(uint32_t state, page_id_t id)
+ {
+ ut_ad(state < REMOVE_HASH || state >= UNFIXED);
+ id_= id;
+ zip.fix= state;
+ oldest_modification_= 0;
+ lock.init();
+ ut_d(in_zip_hash= false);
+ ut_d(in_free_list= false);
+ ut_d(in_LRU_list= false);
+ ut_d(in_page_hash= false);
+ old= 0;
+ freed_page_clock= 0;
+ access_time= 0;
+ }
+
+ void set_os_unused()
+ {
+ MEM_NOACCESS(frame, srv_page_size);
+#ifdef MADV_FREE
+ madvise(frame, srv_page_size, MADV_FREE);
+#endif
+ }
+
+ void set_os_used() const
+ {
+ MEM_MAKE_ADDRESSABLE(frame, srv_page_size);
+ }
+public:
+ const page_id_t &id() const { return id_; }
+ uint32_t state() const { return zip.fix; }
+ uint32_t buf_fix_count() const
+ {
+ uint32_t f= state();
+ ut_ad(f >= FREED);
+ return f < UNFIXED ? (f - FREED) : (~LRU_MASK & f);
+ }
+ /** @return whether this block is read or write fixed;
+ read_complete() or write_complete() will always release
+ the io-fix before releasing U-lock or X-lock */
+ bool is_io_fixed() const
+ { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; }
+ /** @return whether this block is write fixed;
+ write_complete() will always release the write-fix before releasing U-lock */
+ bool is_write_fixed() const { return state() >= WRITE_FIX; }
+ /** @return whether this block is read fixed; this should never hold
+ when a thread is holding the block lock in any mode */
+ bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); }
+
+ /** @return if this belongs to buf_pool.unzip_LRU */
+ bool belongs_to_unzip_LRU() const
+ { return UNIV_LIKELY_NULL(zip.data) && frame; }
+
+ bool is_freed() const
+ { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; }
+ bool is_ibuf_exist() const
+ {
+ const auto s= state();
+ ut_ad(s >= UNFIXED);
+ ut_ad(s < READ_FIX);
+ return (s & LRU_MASK) == IBUF_EXIST;
+ }
+ bool is_reinit() const { return !(~state() & REINIT); }
+
+ void set_reinit(uint32_t prev_state)
+ {
+ ut_ad(prev_state < READ_FIX);
+ ut_d(const auto s=) zip.fix.fetch_add(REINIT - prev_state);
+ ut_ad(s > prev_state);
+ ut_ad(s < prev_state + UNFIXED);
+ }
+
+ void set_ibuf_exist()
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+ const auto s= state();
+ ut_ad(s >= UNFIXED);
+ ut_ad(s < READ_FIX);
+ ut_ad(s < IBUF_EXIST || s >= REINIT);
+ zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s));
+ }
+ void clear_ibuf_exist()
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+ ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED);
+ ut_ad(s >= IBUF_EXIST);
+ ut_ad(s < REINIT);
+ }
+
+ uint32_t read_unfix(uint32_t s)
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1);
+ uint32_t old_state= zip.fix.fetch_add(s - READ_FIX);
+ ut_ad(old_state >= READ_FIX);
+ ut_ad(old_state < WRITE_FIX);
+ return old_state + (s - READ_FIX);
+ }
+
+ void set_freed(uint32_t prev_state, uint32_t count= 0)
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(prev_state >= UNFIXED);
+ ut_ad(prev_state < READ_FIX);
+ ut_d(auto s=) zip.fix.fetch_sub((prev_state & LRU_MASK) - FREED - count);
+ ut_ad(!((prev_state ^ s) & LRU_MASK));
+ }
+
+ inline void set_state(uint32_t s);
+ inline void set_corrupt_id();
+
+ /** @return the log sequence number of the oldest pending modification
+ @retval 0 if the block is being removed from (or not in) buf_pool.flush_list
+ @retval 1 if the block is in buf_pool.flush_list but not modified
+ @retval 2 if the block belongs to the temporary tablespace and
+ has unwritten changes */
+ lsn_t oldest_modification() const { return oldest_modification_; }
+ /** @return the log sequence number of the oldest pending modification,
+ @retval 0 if the block is definitely not in buf_pool.flush_list
+ @retval 1 if the block is in buf_pool.flush_list but not modified
+ @retval 2 if the block belongs to the temporary tablespace and
+ has unwritten changes */
+ lsn_t oldest_modification_acquire() const
+ { return oldest_modification_.load(std::memory_order_acquire); }
+ /** Set oldest_modification when adding to buf_pool.flush_list */
+ inline void set_oldest_modification(lsn_t lsn);
+ /** Clear oldest_modification after removing from buf_pool.flush_list */
+ inline void clear_oldest_modification();
+ /** Reset the oldest_modification when marking a persistent page freed */
+ void reset_oldest_modification()
+ {
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+
+ /** Complete a read of a page.
+ @param node data file
+ @return whether the operation succeeded
+ @retval DB_PAGE_CORRUPTED if the checksum fails
+ @retval DB_DECRYPTION_FAILED if the page cannot be decrypted
+ @retval DB_FAIL if the page contains the wrong ID */
+ dberr_t read_complete(const fil_node_t &node);
+
+ /** Note that a block is no longer dirty, while not removing
+ it from buf_pool.flush_list
+ @param temporary whether the page belongs to the temporary tablespace
+ @param error whether an error may have occurred while writing */
+ inline void write_complete(bool temporary, bool error);
+
+ /** Write a flushable page to a file or free a freeable block.
+ @param evict whether to evict the page on write completion
+ @param space tablespace
+ @return whether a page write was initiated and buf_pool.mutex released */
+ bool flush(bool evict, fil_space_t *space);
+
+ /** Notify that a page in a temporary tablespace has been modified. */
+ void set_temp_modified()
+ {
+ ut_ad(fsp_is_system_temporary(id().space()));
+ ut_ad(in_file());
+ ut_ad((oldest_modification() | 2) == 2);
+ oldest_modification_= 2;
+ }
+
+ /** Prepare to release a file page to buf_pool.free. */
+ void free_file_page()
+ {
+ ut_ad((zip.fix.fetch_sub(REMOVE_HASH - MEMORY)) == REMOVE_HASH);
+ /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+ ut_d(oldest_modification_= 0;)
+ id_= page_id_t(~0ULL);
+ }
+
+ void fix_on_recovery()
+ {
+ ut_d(const auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED - 1);
+ ut_ad(f >= READ_FIX);
+ ut_ad(f < WRITE_FIX);
+ }
+
+ uint32_t fix(uint32_t count= 1)
+ {
+ ut_ad(count);
+ ut_ad(count < IBUF_EXIST);
+ uint32_t f= zip.fix.fetch_add(count);
+ ut_ad(f >= FREED);
+ ut_ad(!((f ^ (f + 1)) & LRU_MASK));
+ return f;
+ }
+
+ uint32_t unfix()
+ {
+ uint32_t f= zip.fix.fetch_sub(1);
+ ut_ad(f > FREED);
+ ut_ad(!((f ^ (f - 1)) & LRU_MASK));
+ return f - 1;
+ }
+
+ /** @return the physical size, in bytes */
+ ulint physical_size() const
+ {
+ return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size;
+ }
+
+ /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+ @retval 0 if not compressed */
+ ulint zip_size() const
+ {
+ return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
+ }
+
+ /** @return the byte offset of the page within a file */
+ os_offset_t physical_offset() const
+ {
+ os_offset_t o= id().page_no();
+ return zip.ssize
+ ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1))
+ : o << srv_page_size_shift;
+ }
+
+ /** @return whether the block is mapped to a data file */
+ bool in_file() const { return state() >= FREED; }
+
+ /** @return whether the block can be relocated in memory.
+ The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
+ inline bool can_relocate() const;
+ /** @return whether the block has been flagged old in buf_pool.LRU */
+ inline bool is_old() const;
+ /** Set whether a block is old in buf_pool.LRU */
+ inline void set_old(bool old);
+ /** Flag a page accessed in buf_pool
+ @return whether this is not the first access */
+ bool set_accessed()
+ {
+ if (is_accessed()) return true;
+ access_time= static_cast<uint32_t>(ut_time_ms());
+ return false;
+ }
+ /** @return ut_time_ms() at the time of first access of a block in buf_pool
+ @retval 0 if not accessed */
+ unsigned is_accessed() const { ut_ad(in_file()); return access_time; }
+};
+
+/** The buffer control block structure */
+
+struct buf_block_t{
+
+ /** @name General fields */
+ /* @{ */
+
+ buf_page_t page; /*!< page information; this must
+ be the first field, so that
+ buf_pool.page_hash can point
+ to buf_page_t or buf_block_t */
+#ifdef UNIV_DEBUG
+ /** whether page.list is in buf_pool.withdraw
+ ((state() == NOT_USED)) and the buffer pool is being shrunk;
+ protected by buf_pool.mutex */
+ bool in_withdraw_list;
+ /** whether unzip_LRU is in buf_pool.unzip_LRU
+ (in_file() && frame && zip.data);
+ protected by buf_pool.mutex */
+ bool in_unzip_LRU_list;
+#endif
+ /** member of buf_pool.unzip_LRU (if belongs_to_unzip_LRU()) */
+ UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+ /* @} */
+ /** @name Optimistic search field */
+ /* @{ */
+
+ ib_uint64_t modify_clock; /*!< this clock is incremented every
+ time a pointer to a record on the
+ page may become obsolete; this is
+ used in the optimistic cursor
+ positioning: if the modify clock has
+ not changed, we know that the pointer
+ is still valid; this field may be
+ changed if the thread (1) owns the
+ pool mutex and the page is not
+ bufferfixed, or (2) the thread has an
+ x-latch on the block */
+ /* @} */
+#ifdef BTR_CUR_HASH_ADAPT
+ /** @name Hash search fields (unprotected)
+ NOTE that these fields are NOT protected by any semaphore! */
+ /* @{ */
+
+ volatile uint16_t n_bytes; /*!< recommended prefix length for hash
+ search: number of bytes in
+ an incomplete last field */
+ volatile uint16_t n_fields; /*!< recommended prefix length for hash
+ search: number of full fields */
+ uint16_t n_hash_helps; /*!< counter which controls building
+ of a new hash index for the page */
+ volatile bool left_side; /*!< true or false, depending on
+ whether the leftmost record of several
+ records with the same prefix should be
+ indexed in the hash index */
+ /* @} */
+
+ /** @name Hash search fields
+ These 5 fields may only be modified when:
+ we are holding the appropriate x-latch in btr_search_latches[], and
+ one of the following holds:
+ (1) in_file(), and we are holding lock in any mode, or
+ (2) !is_read_fixed()&&(state()>=UNFIXED||state()==REMOVE_HASH).
+
+ An exception to this is when we init or create a page
+ in the buffer pool in buf0buf.cc.
+
+ Another exception for buf_pool_t::clear_hash_index() is that
+ assigning block->index = NULL (and block->n_pointers = 0)
+ is allowed whenever all AHI latches are exclusively locked.
+
+ Another exception is that ha_insert_for_fold() may
+ decrement n_pointers without holding the appropriate latch
+ in btr_search_latches[]. Thus, n_pointers must be
+ protected by atomic memory access.
+
+ This implies that the fields may be read without race
+ condition whenever any of the following hold:
+ - the btr_search_sys.partition[].latch is being held, or
+ - state() == NOT_USED || state() == MEMORY,
+ and holding some latch prevents the state from changing to that.
+
+ Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
+ is prone to race conditions while buf_pool_t::clear_hash_index() is
+ executing (the adaptive hash index is being disabled). Such use
+ is explicitly commented. */
+
+ /* @{ */
+
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ Atomic_counter<ulint>
+ n_pointers; /*!< used in debugging: the number of
+ pointers in the adaptive hash index
+ pointing to this frame */
+# define assert_block_ahi_empty(block) \
+ ut_a((block)->n_pointers == 0)
+# define assert_block_ahi_empty_on_init(block) do { \
+ MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \
+ assert_block_ahi_empty(block); \
+} while (0)
+# define assert_block_ahi_valid(block) \
+ ut_a((block)->index || (block)->n_pointers == 0)
+# else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ unsigned curr_n_fields:10;/*!< prefix length for hash indexing:
+ number of full fields */
+ unsigned curr_n_bytes:15;/*!< number of bytes in hash
+ indexing */
+ unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+ dict_index_t* index; /*!< Index for which the
+ adaptive hash index has been
+ created, or NULL if the page
+ does not exist in the
+ index. Note that it does not
+ guarantee that the index is
+ complete, though: there may
+ have been hash collisions,
+ record deletions, etc. */
+ /* @} */
+#else /* BTR_CUR_HASH_ADAPT */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+#endif /* BTR_CUR_HASH_ADAPT */
+ void fix() { page.fix(); }
+ uint32_t unfix() { return page.unfix(); }
+
+ /** @return the physical size, in bytes */
+ ulint physical_size() const { return page.physical_size(); }
+
+ /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+ @retval 0 if not compressed */
+ ulint zip_size() const { return page.zip_size(); }
+
+ /** Initialize the block.
+ @param page_id page identifier
+ @param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+ @param state initial state() */
+ void initialise(const page_id_t page_id, ulint zip_size, uint32_t state);
+};
+
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool.zip_hash. */
+/* @{ */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or
+buf_pool.flush_list. A hazard pointer is a buf_page_t pointer
+which we intend to iterate over next and we want it remain valid
+even after we release the mutex that protects the list. */
+class HazardPointer
+{
+public:
+ virtual ~HazardPointer() = default;
+
+ /** @return current value */
+ buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; }
+
+ /** Set current value
+ @param bpage buffer block to be set as hp */
+ void set(buf_page_t *bpage)
+ {
+ mysql_mutex_assert_owner(m_mutex);
+ ut_ad(!bpage || bpage->in_file());
+ m_hp= bpage;
+ }
+
+ /** Checks if a bpage is the hp
+ @param bpage buffer block to be compared
+ @return true if it is hp */
+ bool is_hp(const buf_page_t *bpage) const
+ { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; }
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list. */
+ virtual void adjust(const buf_page_t*) = 0;
+
+#ifdef UNIV_DEBUG
+ /** mutex that protects access to the m_hp. */
+ const mysql_mutex_t *m_mutex= nullptr;
+#endif /* UNIV_DEBUG */
+
+protected:
+ /** hazard pointer */
+ buf_page_t *m_hp= nullptr;
+};
+
+/** Class implementing buf_pool.flush_list hazard pointer */
+class FlushHp : public HazardPointer
+{
+public:
+ ~FlushHp() override = default;
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list.
+ @param bpage buffer block to be compared */
+ MY_ATTRIBUTE((nonnull))
+ void adjust(const buf_page_t *bpage) override
+ {
+ /* We only support reverse traversal for now. */
+ if (is_hp(bpage))
+ m_hp= UT_LIST_GET_PREV(list, m_hp);
+
+ ut_ad(!m_hp || m_hp->oldest_modification());
+ }
+};
+
+/** Class implementing buf_pool.LRU hazard pointer */
+class LRUHp : public HazardPointer {
+public:
+ ~LRUHp() override = default;
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list.
+ @param bpage buffer block to be compared */
+ MY_ATTRIBUTE((nonnull))
+ void adjust(const buf_page_t *bpage) override
+ {
+ /** We only support reverse traversal for now. */
+ if (is_hp(bpage))
+ m_hp= UT_LIST_GET_PREV(LRU, m_hp);
+
+ ut_ad(!m_hp || m_hp->in_LRU_list);
+ }
+};
+
+/** Special purpose iterators to be used when scanning the LRU list.
+The idea is that when one thread finishes the scan it leaves the
+itr in that position and the other thread can start scan from
+there */
+class LRUItr : public LRUHp {
+public:
+ ~LRUItr() override = default;
+
+ /** Select from where to start a scan. If we have scanned
+ too deep into the LRU list it resets the value to the tail
+ of the LRU list.
+ @return buf_page_t from where to start scan. */
+ inline buf_page_t *start();
+};
+
+/** Struct that is embedded in the free zip blocks */
+struct buf_buddy_free_t {
+ union {
+ ulint size; /*!< size of the block */
+ byte bytes[FIL_PAGE_DATA];
+ /*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID]
+ == BUF_BUDDY_FREE_STAMP denotes a free
+ block. If the space_id field of buddy
+ block != BUF_BUDDY_FREE_STAMP, the block
+ is not in any zip_free list. If the
+ space_id is BUF_BUDDY_FREE_STAMP then
+ stamp[0] will contain the
+ buddy block size. */
+ } stamp;
+
+ buf_page_t bpage; /*!< Embedded bpage descriptor */
+ UT_LIST_NODE_T(buf_buddy_free_t) list;
+ /*!< Node of zip_free list */
+};
+
+/** @brief The buffer pool statistics structure;
+protected by buf_pool.mutex unless otherwise noted. */
+struct buf_pool_stat_t{
+ /** Initialize the counters */
+ void init() { memset((void*) this, 0, sizeof *this); }
+
+ ib_counter_t<ulint, ib_counter_element_t> n_page_gets;
+ /*!< number of page gets performed;
+ also successful searches through
+ the adaptive hash index are
+ counted as page gets;
+ NOT protected by buf_pool.mutex */
+ ulint n_pages_read; /*!< number read operations */
+ ulint n_pages_written;/*!< number write operations */
+ ulint n_pages_created;/*!< number of pages created
+ in the pool with no read */
+ ulint n_ra_pages_read_rnd;/*!< number of pages read in
+ as part of random read ahead */
+ ulint n_ra_pages_read;/*!< number of pages read in
+ as part of read ahead */
+ ulint n_ra_pages_evicted;/*!< number of read ahead
+ pages that are evicted without
+ being accessed */
+ ulint n_pages_made_young; /*!< number of pages made young, in
+ buf_page_make_young() */
+ ulint n_pages_not_made_young; /*!< number of pages not made
+ young because the first access
+ was not long enough ago, in
+ buf_page_peek_if_too_old() */
+ /** number of waits for eviction */
+ ulint LRU_waits;
+ ulint LRU_bytes; /*!< LRU size in bytes */
+};
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_t {
+ /** Number of blocks allocated from the buddy system. */
+ ulint used;
+ /** Number of blocks relocated by the buddy system. */
+ ib_uint64_t relocated;
+ /** Total duration of block relocations, in microseconds. */
+ ib_uint64_t relocated_usec;
+};
+
+/** The buffer pool */
+class buf_pool_t
+{
+ /** A chunk of buffers */
+ struct chunk_t
+ {
+ /** number of elements in blocks[] */
+ size_t size;
+ /** memory allocated for the page frames */
+ unsigned char *mem;
+ /** descriptor of mem */
+ ut_new_pfx_t mem_pfx;
+ /** array of buffer control blocks */
+ buf_block_t *blocks;
+
+ /** Map of first page frame address to chunks[] */
+ using map= std::map<const void*, chunk_t*, std::less<const void*>,
+ ut_allocator<std::pair<const void* const,chunk_t*>>>;
+ /** Chunk map that may be under construction by buf_resize_thread() */
+ static map *map_reg;
+ /** Current chunk map for lookup only */
+ static map *map_ref;
+
+ /** @return the memory size bytes. */
+ size_t mem_size() const { return mem_pfx.m_size; }
+
+ /** Register the chunk */
+ void reg() { map_reg->emplace(map::value_type(blocks->page.frame, this)); }
+
+ /** Allocate a chunk of buffer frames.
+ @param bytes requested size
+ @return whether the allocation succeeded */
+ inline bool create(size_t bytes);
+
+#ifdef UNIV_DEBUG
+ /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+ @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+ @return the block
+ @retval nullptr if not found */
+ const buf_block_t *contains_zip(const void *data) const
+ {
+ const buf_block_t *block= blocks;
+ for (auto i= size; i--; block++)
+ if (block->page.zip.data == data)
+ return block;
+ return nullptr;
+ }
+
+ /** Check that all blocks are in a replaceable state.
+ @return address of a non-free block
+ @retval nullptr if all freed */
+ inline const buf_block_t *not_freed() const;
+#endif /* UNIV_DEBUG */
+ };
+public:
+ /** Hash cell chain in page_hash_table */
+ struct hash_chain
+ {
+ /** pointer to the first block */
+ buf_page_t *first;
+ };
+private:
+ /** Withdraw blocks from the buffer pool until meeting withdraw_target.
+ @return whether retry is needed */
+ inline bool withdraw_blocks();
+
+ /** Determine if a pointer belongs to a buf_block_t. It can be a pointer to
+ the buf_block_t itself or a member of it.
+ @param ptr a pointer that will not be dereferenced
+ @return whether the ptr belongs to a buf_block_t struct */
+ bool is_block_field(const void *ptr) const
+ {
+ const chunk_t *chunk= chunks;
+ const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new);
+
+ /* TODO: protect chunks with a mutex (the older pointer will
+ currently remain during resize()) */
+ for (; chunk < echunk; chunk++)
+ if (ptr >= reinterpret_cast<const void*>(chunk->blocks) &&
+ ptr < reinterpret_cast<const void*>(chunk->blocks + chunk->size))
+ return true;
+ return false;
+ }
+
+ /** Try to reallocate a control block.
+ @param block control block to reallocate
+ @return whether the reallocation succeeded */
+ inline bool realloc(buf_block_t *block);
+
+public:
+ bool is_initialised() const { return chunks != nullptr; }
+
+ /** Create the buffer pool.
+ @return whether the creation failed */
+ bool create();
+
+ /** Clean up after successful create() */
+ void close();
+
+ /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+ inline void resize();
+
+ /** @return whether resize() is in progress */
+ bool resize_in_progress() const
+ {
+ return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed));
+ }
+
+ /** @return the current size in blocks */
+ size_t get_n_pages() const
+ {
+ ut_ad(is_initialised());
+ size_t size= 0;
+ for (auto j= ut_min(n_chunks_new, n_chunks); j--; )
+ size+= chunks[j].size;
+ return size;
+ }
+
+ /** Determine whether a frame is intended to be withdrawn during resize().
+ @param ptr pointer within a buf_page_t::frame
+ @return whether the frame will be withdrawn */
+ bool will_be_withdrawn(const byte *ptr) const
+ {
+ ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+ if (resize_in_progress())
+ mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+ for (const chunk_t *chunk= chunks + n_chunks_new,
+ * const echunk= chunks + n_chunks;
+ chunk != echunk; chunk++)
+ if (ptr >= chunk->blocks->page.frame &&
+ ptr < (chunk->blocks + chunk->size - 1)->page.frame + srv_page_size)
+ return true;
+ return false;
+ }
+
+ /** Determine whether a block is intended to be withdrawn during resize().
+ @param bpage buffer pool block
+ @return whether the frame will be withdrawn */
+ bool will_be_withdrawn(const buf_page_t &bpage) const
+ {
+ ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+ if (resize_in_progress())
+ mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+ for (const chunk_t *chunk= chunks + n_chunks_new,
+ * const echunk= chunks + n_chunks;
+ chunk != echunk; chunk++)
+ if (&bpage >= &chunk->blocks->page &&
+ &bpage < &chunk->blocks[chunk->size].page)
+ return true;
+ return false;
+ }
+
+ /** Release and evict a corrupted page.
+ @param bpage x-latched page that was found corrupted
+ @param state expected current state of the page */
+ ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state);
+
+ /** Release a memory block to the buffer pool. */
+ ATTRIBUTE_COLD void free_block(buf_block_t *block);
+
+#ifdef UNIV_DEBUG
+ /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+ @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+ @return the block
+ @retval nullptr if not found */
+ const buf_block_t *contains_zip(const void *data) const
+ {
+ mysql_mutex_assert_owner(&mutex);
+ for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks;
+ chunk != end; chunk++)
+ if (const buf_block_t *block= chunk->contains_zip(data))
+ return block;
+ return nullptr;
+ }
+
+ /** Assert that all buffer pool pages are in a replaceable state */
+ void assert_all_freed();
+#endif /* UNIV_DEBUG */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /** Clear the adaptive hash index on all pages in the buffer pool. */
+ inline void clear_hash_index();
+
+ /** Get a buffer block from an adaptive hash index pointer.
+ This function does not return if the block is not identified.
+ @param ptr pointer to within a page frame
+ @return pointer to block, never NULL */
+ inline buf_block_t *block_from_ahi(const byte *ptr) const;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /**
+ @return the smallest oldest_modification lsn for any page
+ @retval empty_lsn if all modified persistent pages have been flushed */
+ lsn_t get_oldest_modification(lsn_t empty_lsn)
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list))
+ {
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ lsn_t lsn= bpage->oldest_modification();
+ if (lsn != 1)
+ {
+ ut_ad(lsn > 2);
+ return lsn;
+ }
+ delete_from_flush_list(bpage);
+ }
+ return empty_lsn;
+ }
+
+ /** Determine if a buffer block was created by chunk_t::create().
+ @param block block descriptor (not dereferenced)
+ @return whether block has been created by chunk_t::create() */
+ bool is_uncompressed(const buf_block_t *block) const
+ {
+ return is_block_field(reinterpret_cast<const void*>(block));
+ }
+
+public:
+ /** @return whether the buffer pool contains a page
+ @tparam allow_watch whether to allow watch_is_sentinel()
+ @param page_id page identifier
+ @param chain hash table chain for page_id.fold() */
+ template<bool allow_watch= false>
+ TRANSACTIONAL_INLINE
+ bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
+ {
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
+ buf_page_t *bpage= page_hash.get(page_id, chain);
+ if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)])
+ {
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(!bpage->zip.data);
+ if (!allow_watch)
+ bpage= nullptr;
+ }
+ return bpage;
+ }
+
+ /** Determine if a block is a sentinel for a buffer pool watch.
+ @param bpage page descriptor
+ @return whether bpage a sentinel for a buffer pool watch */
+ bool watch_is_sentinel(const buf_page_t &bpage)
+ {
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+ page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
+ is_locked());
+#endif /* SAFE_MUTEX */
+ ut_ad(bpage.in_file());
+ if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)])
+ return false;
+ ut_ad(!bpage.in_zip_hash);
+ ut_ad(!bpage.zip.data);
+ return true;
+ }
+
+ /** Check if a watched page has been read.
+ This may only be called after !watch_set() and before invoking watch_unset().
+ @param id page identifier
+ @return whether the page was read to the buffer pool */
+ TRANSACTIONAL_INLINE
+ bool watch_occurred(const page_id_t id)
+ {
+ hash_chain &chain= page_hash.cell_get(id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
+ /* The page must exist because watch_set() increments buf_fix_count. */
+ return !watch_is_sentinel(*page_hash.get(id, chain));
+ }
+
+ /** Register a watch for a page identifier.
+ @param id page identifier
+ @param chain page_hash.cell_get(id.fold())
+ @return a buffer page corresponding to id
+ @retval nullptr if the block was not present in page_hash */
+ buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
+
+ /** Stop watching whether a page has been read in.
+ watch_set(id) must have returned nullptr before.
+ @param id page identifier
+ @param chain unlocked hash table chain */
+ void watch_unset(const page_id_t id, hash_chain &chain);
+
+ /** Remove the sentinel block for the watch before replacing it with a
+ real block. watch_unset() or watch_occurred() will notice
+ that the block has been replaced with the real block.
+ @param w sentinel
+ @param chain locked hash table chain
+ @return w->state() */
+ inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain);
+
+ /** @return whether less than 1/4 of the buffer pool is available */
+ TPOOL_SUPPRESS_TSAN
+ bool running_out() const
+ {
+ return !recv_recovery_is_on() &&
+ UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+ n_chunks_new / 4 * chunks->size;
+ }
+
+ /** @return whether the buffer pool has run out */
+ TPOOL_SUPPRESS_TSAN
+ bool ran_out() const
+ { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+
+ /** @return whether the buffer pool is shrinking */
+ inline bool is_shrinking() const
+ {
+ return n_chunks_new < n_chunks;
+ }
+
+#ifdef UNIV_DEBUG
+ /** Validate the buffer pool. */
+ void validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+ /** Write information of the buf_pool to the error log. */
+ void print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+ /** Remove a block from the LRU list.
+ @return the predecessor in the LRU list */
+ buf_page_t *LRU_remove(buf_page_t *bpage)
+ {
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_file());
+ lru_hp.adjust(bpage);
+ lru_scan_itr.adjust(bpage);
+ ut_d(bpage->in_LRU_list= false);
+ buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+ UT_LIST_REMOVE(LRU, bpage);
+ return prev;
+ }
+
+ /** Number of pages to read ahead */
+ static constexpr uint32_t READ_AHEAD_PAGES= 64;
+
+ /** Buffer pool mutex */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+ /** current statistics; protected by mutex */
+ buf_pool_stat_t stat;
+ /** old statistics; protected by mutex */
+ buf_pool_stat_t old_stat;
+
+ /** @name General fields */
+ /* @{ */
+ ulint curr_pool_size; /*!< Current pool size in bytes */
+ ulint LRU_old_ratio; /*!< Reserve this much of the buffer
+ pool for "old" blocks */
+#ifdef UNIV_DEBUG
+ ulint buddy_n_frames; /*!< Number of frames allocated from
+ the buffer pool to the buddy system */
+ ulint mutex_exit_forbidden; /*!< Forbid release mutex */
+#endif
+ ut_allocator<unsigned char> allocator; /*!< Allocator used for
+ allocating memory for the the "chunks"
+ member. */
+ ulint n_chunks; /*!< number of buffer pool chunks */
+ ulint n_chunks_new; /*!< new number of buffer pool chunks.
+ both n_chunks{,new} are protected under
+ mutex */
+ chunk_t* chunks; /*!< buffer pool chunks */
+ chunk_t* chunks_old; /*!< old buffer pool chunks to be freed
+ after resizing buffer pool */
+ /** current pool size in pages */
+ Atomic_counter<ulint> curr_size;
+ /** read-ahead request size in pages */
+ Atomic_counter<uint32_t> read_ahead_area;
+
+ /** Hash table with singly-linked overflow lists */
+ struct page_hash_table
+ {
+ static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
+ static_assert(!(CPU_LEVEL1_DCACHE_LINESIZE & 63),
+ "not a multiple of 64 bytes");
+
+ /** Number of array[] elements per page_hash_latch.
+ Must be one less than a power of 2. */
+ static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1;
+ static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+ ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+ /** number of payload elements in array[] */
+ Atomic_relaxed<ulint> n_cells;
+ /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
+ hash_chain *array;
+
+ /** Create the hash table.
+ @param n the lower bound of n_cells */
+ void create(ulint n);
+
+ /** Free the hash table. */
+ void free() { aligned_free(array); array= nullptr; }
+
+ /** @return the index of an array element */
+ ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
+ /** @return raw array index converted to padded index */
+ static ulint pad(ulint h)
+ {
+ ulint latches= h / ELEMENTS_PER_LATCH;
+ ulint empty_slots= latches * EMPTY_SLOTS_PER_LATCH;
+ return 1 + latches + empty_slots + h;
+ }
+ private:
+ /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+ static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+ /** @return the index of an array element */
+ static ulint calc_hash(ulint fold, ulint n_cells)
+ {
+ return pad(hash(fold, n_cells));
+ }
+ public:
+ /** @return the latch covering a hash table chain */
+ static page_hash_latch &lock_get(hash_chain &chain)
+ {
+ static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+ "must be one less than a power of 2");
+ const size_t addr= reinterpret_cast<size_t>(&chain);
+ ut_ad(addr & (ELEMENTS_PER_LATCH * sizeof chain));
+ return *reinterpret_cast<page_hash_latch*>
+ (addr & ~(ELEMENTS_PER_LATCH * sizeof chain));
+ }
+
+ /** Get a hash table slot. */
+ hash_chain &cell_get(ulint fold) const
+ { return array[calc_hash(fold, n_cells)]; }
+
+ /** Append a block descriptor to a hash bucket chain. */
+ void append(hash_chain &chain, buf_page_t *bpage)
+ {
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(!bpage->hash);
+ ut_d(bpage->in_page_hash= true);
+ buf_page_t **prev= &chain.first;
+ while (*prev)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
+ }
+
+ /** Remove a block descriptor from a hash bucket chain. */
+ void remove(hash_chain &chain, buf_page_t *bpage)
+ {
+ ut_ad(bpage->in_page_hash);
+ buf_page_t **prev= &chain.first;
+ while (*prev != bpage)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage->hash;
+ ut_d(bpage->in_page_hash= false);
+ bpage->hash= nullptr;
+ }
+
+ /** Replace a block descriptor with another. */
+ void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage)
+ {
+ ut_ad(old->in_page_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_d(old->in_page_hash= false);
+ ut_ad(bpage->hash == old->hash);
+ old->hash= nullptr;
+ buf_page_t **prev= &chain.first;
+ while (*prev != old)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
+ }
+
+ /** Look up a page in a hash bucket chain. */
+ inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const;
+
+ /** Exclusively aqcuire all latches */
+ inline void write_lock_all();
+
+ /** Release all latches */
+ inline void write_unlock_all();
+ };
+
+ /** Hash table of file pages (buf_page_t::in_file() holds),
+ indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
+ page_hash_table page_hash;
+
+ /** map of block->frame to buf_block_t blocks that belong
+ to buf_buddy_alloc(); protected by buf_pool.mutex */
+ hash_table_t zip_hash;
+ Atomic_counter<ulint>
+ n_pend_unzip; /*!< number of pending decompressions */
+
+ time_t last_printout_time;
+ /*!< when buf_print_io was last time
+ called */
+ buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+ /*!< Statistics of buddy system,
+ indexed by block size */
+
+ /* @} */
+
+ /** number of index page splits */
+ Atomic_counter<ulint> pages_split;
+
+ /** @name Page flushing algorithm fields */
+ /* @{ */
+
+ /** mutex protecting flush_list, buf_page_t::set_oldest_modification()
+ and buf_page_t::list pointers when !oldest_modification() */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+ /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
+ FlushHp flush_hp;
+ /** flush_list size in bytes; protected by flush_list_mutex */
+ ulint flush_list_bytes;
+ /** possibly modified persistent pages (a subset of LRU);
+ os_aio_pending_writes() is approximately COUNT(is_write_fixed()) */
+ UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+ /** number of blocks ever added to flush_list;
+ sometimes protected by flush_list_mutex */
+ size_t flush_list_requests;
+
+ TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size)
+ { ut_ad(size); flush_list_requests+= size; }
+private:
+ static constexpr unsigned PAGE_CLEANER_IDLE= 1;
+ static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
+ static constexpr unsigned LRU_FLUSH= 4;
+
+ /** Number of pending LRU flush * LRU_FLUSH +
+ PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
+ unsigned page_cleaner_status;
+ /** track server activity count for signaling idle flushing */
+ ulint last_activity_count;
+public:
+ /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
+ pthread_cond_t do_flush_list;
+ /** broadcast when !n_flush(); protected by flush_list_mutex */
+ pthread_cond_t done_flush_LRU;
+ /** broadcast when a batch completes; protected by flush_list_mutex */
+ pthread_cond_t done_flush_list;
+
+ /** @return number of pending LRU flush */
+ unsigned n_flush() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_status / LRU_FLUSH;
+ }
+
+ /** Increment the number of pending LRU flush */
+ inline void n_flush_inc();
+
+ /** Decrement the number of pending LRU flush */
+ inline void n_flush_dec();
+
+ /** Decrement the number of pending LRU flush
+ while holding flush_list_mutex */
+ inline void n_flush_dec_holding_mutex();
+
+ /** @return whether flush_list flushing is active */
+ bool flush_list_active() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_status & FLUSH_LIST_ACTIVE;
+ }
+
+ void flush_list_set_active()
+ {
+ ut_ad(!flush_list_active());
+ page_cleaner_status+= FLUSH_LIST_ACTIVE;
+ }
+ void flush_list_set_inactive()
+ {
+ ut_ad(flush_list_active());
+ page_cleaner_status-= FLUSH_LIST_ACTIVE;
+ }
+
+ /** @return whether the page cleaner must sleep due to being idle */
+ bool page_cleaner_idle() const noexcept
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_status & PAGE_CLEANER_IDLE;
+ }
+
+ /** @return whether the page cleaner may be initiating writes */
+ bool page_cleaner_active() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ static_assert(PAGE_CLEANER_IDLE == 1, "efficiency");
+ return page_cleaner_status > PAGE_CLEANER_IDLE;
+ }
+
+ /** Wake up the page cleaner if needed.
+ @param for_LRU whether to wake up for LRU eviction */
+ void page_cleaner_wakeup(bool for_LRU= false);
+
+ /** Register whether an explicit wakeup of the page cleaner is needed */
+ void page_cleaner_set_idle(bool deep_sleep)
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
+ (PAGE_CLEANER_IDLE * deep_sleep);
+ }
+
+ /** Update server last activity count */
+ void update_last_activity_count(ulint activity_count)
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ last_activity_count= activity_count;
+ }
+
+ unsigned freed_page_clock;/*!< a sequence number used
+ to count the number of buffer
+ blocks removed from the end of
+ the LRU list; NOTE that this
+ counter may wrap around at 4
+ billion! A thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ /** Cleared when buf_LRU_get_free_block() fails.
+ Set whenever the free list grows, along with a broadcast of done_free.
+ Protected by buf_pool.mutex. */
+ Atomic_relaxed<bool> try_LRU_scan;
+ /* @} */
+
+ /** @name LRU replacement algorithm fields */
+ /* @{ */
+
+ UT_LIST_BASE_NODE_T(buf_page_t) free;
+ /*!< base node of the free
+ block list */
+ /** broadcast each time when the free list grows or try_LRU_scan is set;
+ protected by mutex */
+ pthread_cond_t done_free;
+
+ UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
+ /*!< base node of the withdraw
+ block list. It is only used during
+ shrinking buffer pool size, not to
+ reuse the blocks will be removed */
+
+ ulint withdraw_target;/*!< target length of withdraw
+ block list, when withdrawing */
+
+ /** "hazard pointer" used during scan of LRU while doing
+ LRU list batch. Protected by buf_pool_t::mutex. */
+ LRUHp lru_hp;
+
+ /** Iterator used to scan the LRU list when searching for
+ replacable victim. Protected by buf_pool_t::mutex. */
+ LRUItr lru_scan_itr;
+
+ UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+ /*!< base node of the LRU list */
+
+ buf_page_t* LRU_old; /*!< pointer to the about
+ LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+ oldest blocks in the LRU list;
+ NULL if LRU length less than
+ BUF_LRU_OLD_MIN_LEN;
+ NOTE: when LRU_old != NULL, its length
+ should always equal LRU_old_len */
+ ulint LRU_old_len; /*!< length of the LRU list from
+ the block to which LRU_old points
+ onward, including that block;
+ see buf0lru.cc for the restrictions
+ on this value; 0 if LRU_old == NULL;
+ NOTE: LRU_old_len must be adjusted
+ whenever LRU_old shrinks or grows! */
+
+ UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+ /*!< base node of the
+ unzip_LRU list */
+
+ /* @} */
+ /** free ROW_FORMAT=COMPRESSED page frames */
+ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
+#endif
+
+ /** Sentinels to detect if pages are read into the buffer pool while
+ a delete-buffering operation is pending. Protected by mutex. */
+ buf_page_t watch[innodb_purge_threads_MAX + 1];
+ /** Reserve a buffer. */
+ buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
+
+ /** Remove a block from flush_list.
+ @param bpage buffer pool page */
+ void delete_from_flush_list(buf_page_t *bpage) noexcept;
+
+ /** Prepare to insert a modified blcok into flush_list.
+ @param lsn start LSN of the mini-transaction
+ @return insert position for insert_into_flush_list() */
+ inline buf_page_t *prepare_insert_into_flush_list(lsn_t lsn) noexcept;
+
+ /** Insert a modified block into the flush list.
+ @param prev insert position (from prepare_insert_into_flush_list())
+ @param block modified block
+ @param lsn start LSN of the mini-transaction that modified the block */
+ inline void insert_into_flush_list(buf_page_t *prev, buf_block_t *block,
+ lsn_t lsn) noexcept;
+
+ /** Free a page whose underlying file page has been freed. */
+ ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage) noexcept;
+
+private:
+ /** Temporary memory for page_compressed and encrypted I/O */
+ struct io_buf_t
+ {
+ /** number of elements in slots[] */
+ ulint n_slots;
+ /** array of slots */
+ buf_tmp_buffer_t *slots;
+
+ void create(ulint n_slots);
+
+ void close();
+
+ /** Reserve a buffer */
+ buf_tmp_buffer_t *reserve();
+ } io_buf;
+
+ /** whether resize() is in the critical path */
+ std::atomic<bool> resizing;
+};
+
+/** The InnoDB buffer pool */
+extern buf_pool_t buf_pool;
+
+inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id,
+ const hash_chain &chain)
+ const
+{
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(mysql_mutex_is_owner(&buf_pool.mutex) ||
+ lock_get(const_cast<hash_chain&>(chain)).is_locked());
+#endif /* SAFE_MUTEX */
+ for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash)
+ {
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->in_file());
+ if (bpage->id() == id)
+ return bpage;
+ }
+ return nullptr;
+}
+
+#ifdef SUX_LOCK_GENERIC
+inline void page_hash_latch::lock_shared()
+{
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
+ if (!read_trylock())
+ read_lock_wait();
+}
+
+inline void page_hash_latch::lock()
+{
+ if (!write_trylock())
+ write_lock_wait();
+}
+#endif /* SUX_LOCK_GENERIC */
+
+inline void buf_page_t::set_state(uint32_t s)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(s <= REMOVE_HASH || s >= UNFIXED);
+ ut_ad(s < WRITE_FIX);
+ ut_ad(s <= READ_FIX || zip.fix == READ_FIX);
+ zip.fix= s;
+}
+
+inline void buf_page_t::set_corrupt_id()
+{
+#ifdef UNIV_DEBUG
+ switch (oldest_modification()) {
+ case 0:
+ break;
+ case 2:
+ ut_ad(fsp_is_system_temporary(id().space()));
+ /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+ ut_d(oldest_modification_= 0;)
+ break;
+ default:
+ ut_ad("block is dirty" == 0);
+ }
+ const auto f= state();
+ if (f != REMOVE_HASH)
+ {
+ ut_ad(f >= UNFIXED);
+ ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+ is_write_locked());
+ }
+#endif
+ id_.set_corrupted();
+}
+
+/** Set oldest_modification when adding to buf_pool.flush_list */
+inline void buf_page_t::set_oldest_modification(lsn_t lsn)
+{
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+ ut_ad(oldest_modification() <= 1);
+ ut_ad(lsn > 2);
+ oldest_modification_= lsn;
+}
+
+/** Clear oldest_modification after removing from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification()
+{
+#ifdef SAFE_MUTEX
+ if (oldest_modification() != 2)
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+#endif /* SAFE_MUTEX */
+ ut_d(const auto s= state());
+ ut_ad(s >= REMOVE_HASH);
+ ut_ad(oldest_modification());
+ ut_ad(!list.prev);
+ ut_ad(!list.next);
+ /* We must use release memory order to guarantee that callers of
+ oldest_modification_acquire() will observe the block as
+ being detached from buf_pool.flush_list, after reading the value 0. */
+ oldest_modification_.store(0, std::memory_order_release);
+}
+
+/** @return whether the block can be relocated in memory.
+The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
+inline bool buf_page_t::can_relocate() const
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ const auto f= state();
+ ut_ad(f >= FREED);
+ ut_ad(in_LRU_list);
+ return (f == FREED || (f < READ_FIX && !(f & ~LRU_MASK))) &&
+ !lock.is_locked_or_waiting();
+}
+
+/** @return whether the block has been flagged old in buf_pool.LRU */
+inline bool buf_page_t::is_old() const
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(in_file());
+ ut_ad(in_LRU_list);
+ return old;
+}
+
+/** Set whether a block is old in buf_pool.LRU */
+inline void buf_page_t::set_old(bool old)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+ ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr));
+ /* If a block is flagged "old", the LRU_old list must exist. */
+ ut_a(!old || buf_pool.LRU_old);
+
+ if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this))
+ {
+ const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this);
+ const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this);
+ if (prev->old == next->old)
+ ut_a(prev->old == old);
+ else
+ {
+ ut_a(!prev->old);
+ ut_a(buf_pool.LRU_old == (old ? this : next));
+ }
+ }
+#endif /* UNIV_LRU_DEBUG */
+
+ this->old= old;
+}
+
+#ifdef UNIV_DEBUG
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() do { \
+ mysql_mutex_assert_owner(&buf_pool.mutex); \
+ buf_pool.mutex_exit_forbidden++; \
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() do { \
+ mysql_mutex_assert_owner(&buf_pool.mutex); \
+ ut_ad(buf_pool.mutex_exit_forbidden--); \
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+#endif
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not LRU, not flush_list, nor page_hash
+MEMORY: is not in any of free, LRU, flush_list, page_hash
+in_file(): is not in free list, is in LRU list, id() is defined,
+ is in page_hash (not necessarily if is_read_fixed())
+
+ is in buf_pool.flush_list, if and only
+ if oldest_modification == 1 || oldest_modification > 2
+
+ (1) if is_write_fixed(): is u-locked
+ (2) if is_read_fixed(): is x-locked
+
+State transitions:
+
+NOT_USED => MEMORY
+MEMORY => NOT_USED
+MEMORY => UNFIXED
+UNFIXED => in_file()
+in_file() => UNFIXED or FREED
+UNFIXED or FREED => REMOVE_HASH
+REMOVE_HASH => NOT_USED (if and only if !oldest_modification())
+*/
+
+/** Select from where to start a scan. If we have scanned
+too deep into the LRU list it resets the value to the tail
+of the LRU list.
+@return buf_page_t from where to start scan. */
+inline buf_page_t *LRUItr::start()
+{
+ mysql_mutex_assert_owner(m_mutex);
+
+ if (!m_hp || m_hp->old)
+ m_hp= UT_LIST_GET_LAST(buf_pool.LRU);
+
+ return m_hp;
+}
+
+#ifdef UNIV_DEBUG
+/** Functor to validate the LRU list. */
+struct CheckInLRUList {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_a(elem->in_LRU_list);
+ }
+
+ static void validate()
+ {
+ ut_list_validate(buf_pool.LRU, CheckInLRUList());
+ }
+};
+
+/** Functor to validate the LRU list. */
+struct CheckInFreeList {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_a(elem->in_free_list);
+ }
+
+ static void validate()
+ {
+ ut_list_validate(buf_pool.free, CheckInFreeList());
+ }
+};
+
+struct CheckUnzipLRUAndLRUList {
+ void operator()(const buf_block_t* elem) const
+ {
+ ut_a(elem->page.in_LRU_list);
+ ut_a(elem->in_unzip_LRU_list);
+ }
+
+ static void validate()
+ {
+ ut_list_validate(buf_pool.unzip_LRU,
+ CheckUnzipLRUAndLRUList());
+ }
+};
+#endif /* UNIV_DEBUG */
+
+#include "buf0buf.inl"
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
new file mode 100644
index 00000000..b3158cf1
--- /dev/null
+++ b/storage/innobase/include/buf0buf.inl
@@ -0,0 +1,132 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.ic
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+
+/** Determine if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+The page must be either buffer-fixed, or its page hash must be locked.
+@param[in] bpage buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage)
+{
+ /* FIXME: bpage->freed_page_clock is 31 bits */
+ return((buf_pool.freed_page_clock & ((1UL << 31) - 1))
+ < (bpage->freed_page_clock
+ + (buf_pool.curr_size
+ * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio)
+ / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in] bpage buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
+{
+ if (buf_pool.freed_page_clock == 0) {
+ /* If eviction has not started yet, do not update the
+ statistics or move blocks in the LRU list. This is
+ either the warm-up phase or an in-memory workload. */
+ return(FALSE);
+ } else if (buf_LRU_old_threshold_ms && bpage->old) {
+ uint32_t access_time = bpage->is_accessed();
+
+ /* It is possible that the below comparison returns an
+ unexpected result. 2^32 milliseconds pass in about 50 days,
+ so if the difference between ut_time_ms() and access_time
+ is e.g. 50 days + 15 ms, then the below will behave as if
+ it is 15 ms. This is known and fixing it would require to
+ increase buf_page_t::access_time from 32 to 64 bits. */
+ if (access_time
+ && ((ib_uint32_t) (ut_time_ms() - access_time))
+ >= buf_LRU_old_threshold_ms) {
+ return(TRUE);
+ }
+
+ buf_pool.stat.n_pages_not_made_young++;
+ return false;
+ } else {
+ return !buf_page_peek_if_young(bpage);
+ }
+}
+
+/** Allocate a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+inline buf_block_t *buf_block_alloc()
+{
+ return buf_LRU_get_free_block(false);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block) /*!< in, own: block to be freed */
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef SAFE_MUTEX
+ ut_ad((mysql_mutex_is_owner(&buf_pool.mutex)
+ && !block->page.buf_fix_count())
+ || block->page.lock.have_u_or_x());
+#else /* SAFE_MUTEX */
+ ut_ad(!block->page.buf_fix_count() || block->page.lock.have_u_or_x());
+#endif /* SAFE_MUTEX */
+ assert_block_ahi_valid(block);
+
+ block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+ ut_ad(block->page.lock.have_any());
+ return(block->modify_clock);
+}
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
new file mode 100644
index 00000000..d9f03177
--- /dev/null
+++ b/storage/innobase/include/buf0checksum.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#pragma once
+#include "buf0types.h"
+
+/** Calculate the CRC32 checksum of a page. The value is stored to the page
+when it is written to a file and also checked for a match when reading from
+the file. Note that we must be careful to calculate the same value on all
+architectures.
+@param[in] page buffer page (srv_page_size bytes)
+@return CRC-32C */
+uint32_t buf_calc_page_crc32(const byte* page);
+
+#ifndef UNIV_INNOCHECKSUM
+/** Calculate a checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page);
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page);
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
new file mode 100644
index 00000000..9932b0e5
--- /dev/null
+++ b/storage/innobase/include/buf0dblwr.h
@@ -0,0 +1,164 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dblwr.h
+Doublewrite buffer module
+
+Created 2011/12/19 Inaam Rana
+*******************************************************/
+
+#pragma once
+
+#include "os0file.h"
+#include "buf0types.h"
+
+/** Doublewrite control struct */
+class buf_dblwr_t
+{
+ struct element
+ {
+ /** asynchronous write request */
+ IORequest request;
+ /** payload size in bytes */
+ size_t size;
+ };
+
+ struct slot
+ {
+ /** first free position in write_buf measured in units of
+ * srv_page_size */
+ ulint first_free;
+ /** number of slots reserved for the current write batch */
+ ulint reserved;
+ /** the doublewrite buffer, aligned to srv_page_size */
+ byte* write_buf;
+ /** buffer blocks to be written via write_buf */
+ element* buf_block_arr;
+ };
+
+ /** the page number of the first doublewrite block (block_size() pages) */
+ page_id_t block1{0, 0};
+ /** the page number of the second doublewrite block (block_size() pages) */
+ page_id_t block2{0, 0};
+
+ /** mutex protecting the data members below */
+ mysql_mutex_t mutex;
+ /** condition variable for !batch_running */
+ pthread_cond_t cond;
+ /** whether a batch is being written from the doublewrite buffer */
+ bool batch_running;
+ /** number of expected flush_buffered_writes_completed() calls */
+ unsigned flushing_buffered_writes;
+ /** number of flush_buffered_writes_completed() calls */
+ ulint writes_completed;
+ /** number of pages written by flush_buffered_writes_completed() */
+ ulint pages_written;
+
+ slot slots[2];
+ slot *active_slot;
+
+ /** Initialise the persistent storage of the doublewrite buffer.
+ @param header doublewrite page header in the TRX_SYS page */
+ inline void init(const byte *header);
+
+ /** Flush possible buffered writes to persistent storage. */
+ bool flush_buffered_writes(const ulint size);
+
+public:
+ /** Initialise the doublewrite buffer data structures. */
+ void init();
+ /** Create or restore the doublewrite buffer in the TRX_SYS page.
+ @return whether the operation succeeded */
+ bool create();
+ /** Free the doublewrite buffer. */
+ void close();
+
+ /** Acquire the mutex */
+ void lock() { mysql_mutex_lock(&mutex); }
+ /** @return the number of completed batches */
+ ulint batches() const
+ { mysql_mutex_assert_owner(&mutex); return writes_completed; }
+ /** @return the number of final pages written */
+ ulint written() const
+ { mysql_mutex_assert_owner(&mutex); return pages_written; }
+ /** Release the mutex */
+ void unlock() { mysql_mutex_unlock(&mutex); }
+
+ /** Initialize the doublewrite buffer memory structure on recovery.
+ If we are upgrading from a version before MySQL 4.1, then this
+ function performs the necessary update operations to support
+ innodb_file_per_table. If we are in a crash recovery, this function
+ loads the pages from double write buffer into memory.
+ @param file File handle
+ @param path Path name of file
+ @return DB_SUCCESS or error code */
+ dberr_t init_or_load_pages(pfs_os_file_t file, const char *path);
+
+ /** Process and remove the double write buffer pages for all tablespaces. */
+ void recover();
+
+ /** Update the doublewrite buffer on data page write completion. */
+ void write_completed();
+ /** Flush possible buffered writes to persistent storage.
+ It is very important to call this function after a batch of writes has been
+ posted, and also when we may have to wait for a page latch!
+ Otherwise a deadlock of threads can occur. */
+ void flush_buffered_writes();
+ /** Update the doublewrite buffer on write batch completion
+ @param request the completed batch write request */
+ void flush_buffered_writes_completed(const IORequest &request);
+
+ /** Size of the doublewrite block in pages */
+ uint32_t block_size() const { return FSP_EXTENT_SIZE; }
+
+ /** Schedule a page write. If the doublewrite memory buffer is full,
+ flush_buffered_writes() will be invoked to make space.
+ @param request asynchronous write request
+ @param size payload size in bytes */
+ void add_to_batch(const IORequest &request, size_t size);
+
+ /** Determine whether the doublewrite buffer has been created */
+ bool is_created() const
+ { return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
+
+ /** @return whether a page identifier is part of the doublewrite buffer */
+ bool is_inside(const page_id_t id) const
+ {
+ if (!is_created())
+ return false;
+ ut_ad(block1 < block2);
+ if (id < block1)
+ return false;
+ const uint32_t size= block_size();
+ return id < block1 + size || (id >= block2 && id < block2 + size);
+ }
+
+ /** Wait for flush_buffered_writes() to be fully completed */
+ void wait_flush_buffered_writes()
+ {
+ mysql_mutex_lock(&mutex);
+ while (batch_running)
+ my_cond_wait(&cond, &mutex.m_mutex);
+ mysql_mutex_unlock(&mutex);
+ }
+};
+
+/** The doublewrite buffer */
+extern buf_dblwr_t buf_dblwr;
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
new file mode 100644
index 00000000..48586900
--- /dev/null
+++ b/storage/innobase/include/buf0dump.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+/** Start the buffer pool dump/load task and instructs it to start a dump. */
+void buf_dump_start();
+/** Start the buffer pool dump/load task and instructs it to start a load. */
+void buf_load_start();
+
+/** Abort a currently running buffer pool load. */
+void buf_load_abort();
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/
+void buf_load_at_startup();
+
+/** Wait for currently running load/dumps to finish*/
+void buf_load_dump_end();
+
+#endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
new file mode 100644
index 00000000..0cce514b
--- /dev/null
+++ b/storage/innobase/include/buf0flu.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0byte.h"
+#include "log0log.h"
+#include "buf0buf.h"
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_pool.stat.n_pages_written. */
+extern ulint buf_lru_flush_page_count;
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+extern ulint buf_lru_freed_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern Atomic_relaxed<bool> buf_page_cleaner_is_active;
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id tablespace identifier */
+void buf_flush_remove_pages(uint32_t id);
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage); /*!< in/out: destination block */
+
+/** Complete write of a file page from buf_pool.
+@param request write request
+@param error whether the write may have failed */
+void buf_page_write_complete(const IORequest &request, bool error);
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out] page page to be updated */
+void buf_flush_assign_full_crc32_checksum(byte* page);
+
+/** Initialize a page for writing to the tablespace.
+@param[in] block buffer block; NULL if bypassing the buffer pool
+@param[in,out] page page frame
+@param[in,out] page_zip_ compressed page, or NULL if uncompressed
+@param[in] use_full_checksum whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+ const buf_block_t* block,
+ byte* page,
+ void* page_zip_,
+ bool use_full_checksum);
+
+/** Try to flush dirty pages that belong to a given tablespace.
+@param space tablespace
+@param n_flushed number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
+@param max_n wished maximum mumber of blocks flushed
+@param evict whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n, bool evict);
+
+/** Wait until a LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool();
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+void buf_flush_validate();
+#endif /* UNIV_DEBUG */
+
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn);
+
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync();
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
new file mode 100644
index 00000000..aec08e77
--- /dev/null
+++ b/storage/innobase/include/buf0lru.h
@@ -0,0 +1,193 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "buf0types.h"
+#include "hash0hash.h"
+
+// Forward declaration
+struct trx_t;
+struct fil_space_t;
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+extern size_t innodb_lru_flush_size;
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage block to be freed
+@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+ MY_ATTRIBUTE((nonnull));
+
+/** Try to free a replaceable block.
+@param limit maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED);
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only();
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+ * get a block from the buf_pool.free list, success:done
+ * if buf_pool.try_LRU_scan is set
+ * scan LRU up to 100 pages to free a clean block
+ * success:retry the free list
+ * flush up to innodb_lru_flush_size LRU blocks to data files
+ (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth)
+ * on buf_page_write_complete() the blocks will put on buf_pool.free list
+ * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+ MY_ATTRIBUTE((malloc,warn_unused_result));
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU();
+
+/** Puts a block back to the free list.
+@param[in] block block; not containing a file page */
+void
+buf_LRU_block_free_non_file_page(buf_block_t* block);
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the page_size is
+already set when invoking the function, so that we can get correct
+page_size from the buffer page when adding a block into LRU */
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /*!< in: control block */
+ bool old); /*!< in: true if should be put to the old
+ blocks in the LRU list, else put to the
+ start; if the LRU list is very short, added to
+ the start regardless of this parameter */
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old); /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+
+/** Update buf_pool.LRU_old_ratio.
+@param[in] old_pct Reserve this percentage of
+ the buffer pool for "old" blocks
+@param[in] adjust true=adjust the LRU list;
+ false=just assign buf_pool.LRU_old_ratio
+ during the initialization of InnoDB
+@return updated old_pct */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust);
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+void
+buf_LRU_stat_update();
+
+#ifdef UNIV_DEBUG
+/** Validate the LRU list. */
+void buf_LRU_validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Dump the LRU list to stderr. */
+void buf_LRU_print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+/** @name Heuristics for detecting index scan @{ */
+/** The denominator of buf_pool.LRU_old_ratio. */
+#define BUF_LRU_OLD_RATIO_DIV 1024
+/** Maximum value of buf_pool.LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_pool.LRU_old_ratio_update */
+#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV
+/** Minimum value of buf_pool.LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_pool.LRU_old_ratio_update
+The minimum must exceed
+(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
+#define BUF_LRU_OLD_RATIO_MIN 51
+
+#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX
+# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX"
+#endif
+#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV
+# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV"
+#endif
+
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+extern uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** @brief Statistics for selecting the LRU list for eviction.
+
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics we decide
+if we want to evict from buf_pool.unzip_LRU or buf_pool.LRU. */
+struct buf_LRU_stat_t
+{
+ ulint io; /**< Counter of buffer pool I/O operations. */
+ ulint unzip; /**< Counter of page_zip_decompress operations. */
+};
+
+/** Current operation counters. Not protected by any mutex.
+Cleared by buf_LRU_stat_update(). */
+extern buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Protected by buf_pool.mutex. */
+extern buf_LRU_stat_t buf_LRU_stat_sum;
+
+/********************************************************************//**
+Increments the I/O counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++
+/********************************************************************//**
+Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
new file mode 100644
index 00000000..3dd085dd
--- /dev/null
+++ b/storage/innobase/include/buf0rea.h
@@ -0,0 +1,120 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0rea.h
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "buf0buf.h"
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param page_id page id
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size);
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out] space tablespace
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+ ulint zip_size)
+ MY_ATTRIBUTE((nonnull));
+
+/** Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in] page_id page id of a page which the current thread
+wants to access
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] ibuf whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in] page_id page id; see NOTE 3 above
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] ibuf whether if we are inside ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Schedule a page for recovery.
+@param space tablespace
+@param page_id page identifier
+@param recs log records
+@param init page initialization, or nullptr if the page needs to be read */
+void buf_read_recover(fil_space_t *space, const page_id_t page_id,
+ page_recv_t &recs, recv_init *init);
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY 131
+/** read any page */
+#define BUF_READ_ANY_PAGE 132
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
new file mode 100644
index 00000000..6c13f5ee
--- /dev/null
+++ b/storage/innobase/include/buf0types.h
@@ -0,0 +1,235 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Buffer page (uncompressed or compressed) */
+class buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+struct buf_block_t;
+/** Buffer pool statistics struct */
+struct buf_pool_stat_t;
+/** Buffer pool buddy statistics struct */
+struct buf_buddy_stat_t;
+
+/** A buffer frame. @see page_t */
+typedef byte buf_frame_t;
+
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+ /** Write crc32; allow full_crc32,crc32,innodb,none when reading */
+ SRV_CHECKSUM_ALGORITHM_CRC32,
+ /** Write crc32; allow full_crc23,crc32 when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32, innodb or none when reading. */
+ SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32 when reading. */
+ SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+};
+
+inline bool is_checksum_strict(srv_checksum_algorithm_t algo)
+{
+ return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
+}
+
+inline bool is_checksum_strict(ulint algo)
+{
+ return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
+}
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT UNIV_ZIP_SIZE_SHIFT_MIN
+
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW (1U << BUF_BUDDY_LOW_SHIFT)
+
+/** Actual number of buddy sizes based on current page size */
+#define BUF_BUDDY_SIZES (srv_page_size_shift - BUF_BUDDY_LOW_SHIFT)
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX \
+ - BUF_BUDDY_LOW_SHIFT)
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to srv_page_size */
+#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
+
+/** Page identifier. */
+class page_id_t
+{
+public:
+ /** Constructor from (space, page_no).
+ @param space tablespace id
+ @param page_no page number */
+ constexpr page_id_t(uint32_t space, uint32_t page_no) :
+ m_id(uint64_t{space} << 32 | page_no) {}
+
+ constexpr page_id_t(uint64_t id) : m_id(id) {}
+ constexpr bool operator==(const page_id_t& rhs) const
+ { return m_id == rhs.m_id; }
+ constexpr bool operator!=(const page_id_t& rhs) const
+ { return m_id != rhs.m_id; }
+ constexpr bool operator<(const page_id_t& rhs) const
+ { return m_id < rhs.m_id; }
+ constexpr bool operator>(const page_id_t& rhs) const
+ { return m_id > rhs.m_id; }
+ constexpr bool operator<=(const page_id_t& rhs) const
+ { return m_id <= rhs.m_id; }
+ constexpr bool operator>=(const page_id_t& rhs) const
+ { return m_id >= rhs.m_id; }
+ page_id_t &operator--() { ut_ad(page_no()); m_id--; return *this; }
+ page_id_t &operator++()
+ {
+ ut_ad(page_no() < 0xFFFFFFFFU);
+ m_id++;
+ return *this;
+ }
+ page_id_t operator-(uint32_t i) const
+ {
+ ut_ad(page_no() >= i);
+ return page_id_t(m_id - i);
+ }
+ page_id_t operator+(uint32_t i) const
+ {
+ ut_ad(page_no() < ~i);
+ return page_id_t(m_id + i);
+ }
+
+ /** Retrieve the tablespace id.
+ @return tablespace id */
+ constexpr uint32_t space() const { return static_cast<uint32_t>(m_id >> 32); }
+
+ /** Retrieve the page number.
+ @return page number */
+ constexpr uint32_t page_no() const { return static_cast<uint32_t>(m_id); }
+
+ /** Retrieve the fold value.
+ @return fold value */
+ constexpr ulint fold() const
+ { return (ulint{space()} << 20) + space() + page_no(); }
+
+ /** Reset the page number only.
+ @param[in] page_no page number */
+ void set_page_no(uint32_t page_no)
+ {
+ m_id= (m_id & ~uint64_t{0} << 32) | page_no;
+ }
+
+ constexpr ulonglong raw() const { return m_id; }
+
+ /** Flag the page identifier as corrupted. */
+ void set_corrupted() { m_id= ~0ULL; }
+
+ /** @return whether the page identifier belongs to a corrupted page */
+ constexpr bool is_corrupted() const { return m_id == ~0ULL; }
+
+private:
+ /** The page identifier */
+ uint64_t m_id;
+};
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+extern const byte *field_ref_zero;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Latch types */
+enum rw_lock_type_t
+{
+ RW_S_LATCH= 1 << 0,
+ RW_X_LATCH= 1 << 1,
+ RW_SX_LATCH= 1 << 2,
+ RW_NO_LATCH= 1 << 3
+};
+
+#include "sux_lock.h"
+
+#ifdef SUX_LOCK_GENERIC
+class page_hash_latch : private rw_lock
+{
+ /** Wait for a shared lock */
+ void read_lock_wait();
+ /** Wait for an exclusive lock */
+ void write_lock_wait();
+public:
+ /** Acquire a shared lock */
+ inline void lock_shared();
+ /** Acquire an exclusive lock */
+ inline void lock();
+
+ /** @return whether an exclusive lock is being held by any thread */
+ bool is_write_locked() const { return rw_lock::is_write_locked(); }
+
+ /** @return whether any lock is being held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const { return rw_lock::is_locked_or_waiting(); }
+
+ /** Release a shared lock */
+ void unlock_shared() { read_unlock(); }
+ /** Release an exclusive lock */
+ void unlock() { write_unlock(); }
+};
+#elif defined _WIN32 || SIZEOF_SIZE_T >= 8
+class page_hash_latch
+{
+ srw_spin_lock_low lk;
+public:
+ void lock_shared() { lk.rd_lock(); }
+ void unlock_shared() { lk.rd_unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_write_locked() const { return lk.is_write_locked(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_locked_or_waiting() const { return lk.is_locked_or_waiting(); }
+};
+#else
+class page_hash_latch
+{
+ srw_spin_mutex lk;
+public:
+ void lock_shared() { lock(); }
+ void unlock_shared() { unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_write_locked() const { return is_locked(); }
+ bool is_locked_or_waiting() const { return is_locked(); }
+};
+#endif
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
new file mode 100644
index 00000000..a5356e0d
--- /dev/null
+++ b/storage/innobase/include/data0data.h
@@ -0,0 +1,704 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "btr0types.h"
+#include <vector>
+
+#include <ostream>
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+struct big_rec_t;
+struct upd_t;
+
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+ut_d(extern byte data_error);
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ const dtype_t* type); /*!< in: pointer to data type struct */
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+ MY_ATTRIBUTE((nonnull));
+
+/** Gets spatial status for "external storage"
+@param[in,out] field field */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+ const dfield_t* field);
+
+/** Sets spatial status for "external storage"
+@param[in,out] field field
+@param[in] spatial_status spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+ dfield_t* field,
+ spatial_status_t spatial_status);
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+ MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+ dfield_t* field, /*!< in: field */
+ const double* mbr) /*!< in: data */
+ MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2); /*!< in: field to copy from */
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+ MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data) /*!< in: data */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+ MY_ATTRIBUTE((nonnull));
+
+/* Estimate the number of bytes that are going to be allocated when
+creating a new dtuple_t object */
+#define DTUPLE_EST_ALLOC(n_fields) \
+ (sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t))
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@param[in,out] buf buffer to use
+@param[in] buf_size buffer size
+@param[in] n_fields number of field
+@param[in] n_v_fields number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+ void* buf,
+ ulint buf_size,
+ ulint n_fields,
+ ulint n_v_fields)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields)/*!< in: number of fields */
+ MY_ATTRIBUTE((nonnull, malloc));
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields */
+UNIV_INLINE void dtuple_init_v_fld(dtuple_t* vrow);
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields
+@param[in] heap heap memory to use */
+UNIV_INLINE void dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap);
+
+/** Creates a data tuple with possible virtual columns to a memory heap.
+@param[in] heap memory heap where the tuple is created
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+ mem_heap_t* heap,
+ ulint n_fields,
+ ulint n_v_fields);
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you want later to set it smaller, you can use this. */
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields) /*!< in: number of fields */
+ MY_ATTRIBUTE((nonnull));
+/** Copies a data tuple's virtaul fields to another. This is a shallow copy;
+@param[in,out] d_tuple destination tuple
+@param[in] s_tuple source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(
+ dtuple_t* d_tuple,
+ const dtuple_t* s_tuple);
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+ MY_ATTRIBUTE((nonnull, malloc));
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lens */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull));
+/** Fold a prefix given as the number of fields of a tuple.
+@param[in] tuple index record
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in] index_id index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+ const dtuple_t* tuple,
+ ulint n_fields,
+ ulint n_bytes,
+ index_id_t tree_id)
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field) /*!< in: data field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e,
+all fields must have been set.
+@return TRUE if ok */
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield) /*!< in: dfield */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield) /*!< in: dfield */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull));
+
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] field array of data fields
+@param[in] n number of data fields */
+void
+dfield_print(
+ std::ostream& o,
+ const dfield_t* field,
+ ulint n);
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] tuple data tuple */
+void
+dtuple_print(
+ std::ostream& o,
+ const dtuple_t* tuple);
+
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] tuple data tuple */
+inline
+std::ostream&
+operator<<(std::ostream& o, const dtuple_t& tuple)
+{
+ dtuple_print(o, &tuple);
+ return(o);
+}
+
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is clustered */
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ upd_t* upd, /*!< in/out: update vector */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext) /*!< in/out: number of
+ externally stored columns */
+ MY_ATTRIBUTE((malloc, warn_unused_result));
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: entry whose data was put to vector */
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+ MY_ATTRIBUTE((nonnull));
+
+/*######################################################################*/
+
+/** Structure for an SQL data field */
+struct dfield_t{
+ void* data; /*!< pointer to data */
+ unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */
+ unsigned spatial_status:2;
+ /*!< spatial status of externally stored field
+ in undo log for purge */
+ unsigned len; /*!< data length; UNIV_SQL_NULL if SQL null */
+ dtype_t type; /*!< type of data */
+
+ /** Create a deep copy of this object.
+ @param[in,out] heap memory heap in which the clone will be created
+ @return the cloned object */
+ dfield_t* clone(mem_heap_t* heap) const;
+
+ /** @return system field indicates history row */
+ bool vers_history_row() const
+ {
+ ut_ad(type.vers_sys_end());
+ if (type.mtype == DATA_FIXBINARY) {
+ ut_ad(len == sizeof timestamp_max_bytes);
+ return 0 != memcmp(data, timestamp_max_bytes, len);
+ } else {
+ ut_ad(type.mtype == DATA_INT);
+ ut_ad(len == sizeof trx_id_max_bytes);
+ return 0 != memcmp(data, trx_id_max_bytes, len);
+ }
+ ut_ad(0);
+ return false;
+ }
+};
+
+/** Structure for an SQL data tuple of fields (logical record) */
+struct dtuple_t {
+ ulint info_bits; /*!< info bits of an index record:
+ the default is 0; this field is used
+ if an index record is built from
+ a data tuple */
+ ulint n_fields; /*!< number of fields in dtuple */
+ ulint n_fields_cmp; /*!< number of fields which should
+ be used in comparison services
+ of rem0cmp.*; the index search
+ is performed by comparing only these
+ fields, others are ignored; the
+ default value in dtuple creation is
+ the same value as n_fields */
+ dfield_t* fields; /*!< fields */
+ ulint n_v_fields; /*!< number of virtual fields */
+ dfield_t* v_fields; /*!< fields on virtual column */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number, used in
+ debug assertions */
+/** Value of dtuple_t::magic_n */
+# define DATA_TUPLE_MAGIC_N 65478679
+#endif /* UNIV_DEBUG */
+
+ /** Trim the tail of an index tuple before insert or update.
+ After instant ADD COLUMN, if the last fields of a clustered index tuple
+ match the default values that were explicitly specified or implied
+ during ADD COLUMN, there will be no need to store them.
+ NOTE: A page latch in the index must be held, so that the index
+ may not lose 'instantness' before the trimmed tuple has been
+ inserted or updated.
+ @param[in] index index possibly with instantly added columns */
+ void trim(const dict_index_t& index);
+
+ bool vers_history_row() const
+ {
+ for (ulint i = 0; i < n_fields; i++) {
+ const dfield_t* field = &fields[i];
+ if (field->type.vers_sys_end()) {
+ return field->vers_history_row();
+ }
+ }
+ return false;
+ }
+
+ /**
+ @param info_bits the info_bits of a data tuple
+ @return whether this is a hidden metadata record
+ for instant ADD COLUMN or ALTER TABLE */
+ static bool is_alter_metadata(ulint info_bits)
+ {
+ return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER);
+ }
+
+ /**
+ @param info_bits the info_bits of a data tuple
+ @return whether this is a hidden metadata record
+ for instant ADD COLUMN or ALTER TABLE */
+ static bool is_metadata(ulint info_bits)
+ {
+ return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_METADATA_ADD);
+ }
+
+ /** @return whether this is a hidden metadata record
+ for instant ALTER TABLE (not only ADD COLUMN) */
+ bool is_alter_metadata() const { return is_alter_metadata(info_bits); }
+
+ /** @return whether this is a hidden metadata record
+ for instant ADD COLUMN or ALTER TABLE */
+ bool is_metadata() const { return is_metadata(info_bits); }
+
+ /** Copy type information from index fields.
+ @param index index field to be copied */
+ inline void copy_field_types(const dict_index_t &index);
+};
+
+inline ulint dtuple_get_n_fields(const dtuple_t* tuple)
+{ return tuple->n_fields; }
+inline dtype_t* dfield_get_type(dfield_t* field) { return &field->type; }
+inline const dtype_t* dfield_get_type(const dfield_t* field)
+{ return &field->type; }
+inline void* dfield_get_data(dfield_t* field)
+{
+ ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+ return field->data;
+}
+inline const void* dfield_get_data(const dfield_t* field)
+{
+ ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+ return field->data;
+}
+inline ulint dfield_get_len(const dfield_t* field) {
+ ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+ ut_ad(field->len != UNIV_SQL_DEFAULT);
+ return field->len;
+}
+inline bool dfield_is_null(const dfield_t* field)
+{ return field->len == UNIV_SQL_NULL; }
+/** @return whether a column is to be stored off-page */
+inline bool dfield_is_ext(const dfield_t* field)
+{
+ ut_ad(!field->ext || field->len >= BTR_EXTERN_FIELD_REF_SIZE);
+ return static_cast<bool>(field->ext);
+}
+/** Set the "external storage" flag */
+inline void dfield_set_ext(dfield_t* field) { field->ext = 1; }
+
+/** Gets number of virtual fields in a data tuple.
+@param[in] tuple dtuple to check
+@return number of fields */
+inline ulint
+dtuple_get_n_v_fields(const dtuple_t* tuple) { return tuple->n_v_fields; }
+
+inline const dfield_t* dtuple_get_nth_field(const dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_fields);
+ return &tuple->fields[n];
+}
+inline dfield_t* dtuple_get_nth_field(dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_fields);
+ return &tuple->fields[n];
+}
+
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in] tuple tuple
+@oaran[in] n the nth virtual field to get
+@return nth virtual field */
+inline const dfield_t* dtuple_get_nth_v_field(const dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_v_fields);
+ return &tuple->v_fields[n];
+}
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in] tuple tuple
+@oaran[in] n the nth virtual field to get
+@return nth virtual field */
+inline dfield_t* dtuple_get_nth_v_field(dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_v_fields);
+ return &tuple->v_fields[n];
+}
+
+/** A slot for a field in a big rec vector */
+struct big_rec_field_t {
+
+ /** Constructor.
+ @param[in] field_no_ the field number
+ @param[in] len_ the data length
+ @param[in] data_ the data */
+ big_rec_field_t(ulint field_no_, ulint len_, const void* data_)
+ : field_no(field_no_),
+ len(len_),
+ data(data_)
+ {}
+
+ ulint field_no; /*!< field number in record */
+ ulint len; /*!< stored data length, in bytes */
+ const void* data; /*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_t {
+ mem_heap_t* heap; /*!< memory heap from which
+ allocated */
+ const ulint capacity; /*!< fields array size */
+ ulint n_fields; /*!< number of stored fields */
+ big_rec_field_t*fields; /*!< stored fields */
+
+ /** Constructor.
+ @param[in] max the capacity of the array of fields. */
+ explicit big_rec_t(const ulint max)
+ : heap(0),
+ capacity(max),
+ n_fields(0),
+ fields(0)
+ {}
+
+ /** Append one big_rec_field_t object to the end of array of fields */
+ void append(const big_rec_field_t& field)
+ {
+ ut_ad(n_fields < capacity);
+ fields[n_fields] = field;
+ n_fields++;
+ }
+
+ /** Allocate a big_rec_t object in the given memory heap, and for
+ storing n_fld number of fields.
+ @param[in] heap memory heap in which this object is allocated
+ @param[in] n_fld maximum number of fields that can be stored in
+ this object
+ @return the allocated object */
+ static big_rec_t* alloc(
+ mem_heap_t* heap,
+ ulint n_fld);
+};
+
+#include "data0data.inl"
+
+#endif
diff --git a/storage/innobase/include/data0data.inl b/storage/innobase/include/data0data.inl
new file mode 100644
index 00000000..2d1bf5a2
--- /dev/null
+++ b/storage/innobase/include/data0data.inl
@@ -0,0 +1,633 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.ic
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ const dtype_t* type) /*!< in: pointer to data type struct */
+{
+ ut_ad(field != NULL);
+ ut_ad(type != NULL);
+
+ field->type = *type;
+}
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ field->ext = 0;
+ field->len = static_cast<unsigned int>(len);
+}
+
+/** Gets spatial status for "external storage"
+@param[in,out] field field */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+ const dfield_t* field)
+{
+ ut_ad(dfield_is_ext(field));
+
+ return(static_cast<spatial_status_t>(field->spatial_status));
+}
+
+/** Sets spatial status for "external storage"
+@param[in,out] field field
+@param[in] spatial_status spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+ dfield_t* field,
+ spatial_status_t spatial_status)
+{
+ field->spatial_status = spatial_status & 3;
+ ut_ad(dfield_get_spatial_status(field) == spatial_status);
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ field->data = (void*) data;
+ field->ext = 0;
+ field->len = static_cast<unsigned int>(len);
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+ dfield_t* field, /*!< in: field */
+ const double* mbr) /*!< in: data */
+{
+ MEM_CHECK_DEFINED(mbr, sizeof *mbr);
+ field->ext = 0;
+
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ mach_double_write(static_cast<byte*>(field->data)
+ + i * sizeof(double), mbr[i]);
+ }
+
+ field->len = DATA_MBR_LEN;
+}
+
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+{
+ dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ ut_ad(field1 != NULL);
+ ut_ad(field2 != NULL);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+ field1->ext = field2->ext;
+ field1->spatial_status = field2->spatial_status;
+}
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+{
+ if (!dfield_is_null(field)) {
+ MEM_CHECK_DEFINED(field->data, field->len);
+ field->data = mem_heap_dup(heap, field->data, field->len);
+ }
+}
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+{
+ ulint len2 = len;
+
+ if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) {
+ len = field1->len;
+ }
+
+ if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) {
+ len2 = field2->len;
+ }
+
+ return(len == len2
+ && (len == UNIV_SQL_NULL
+ || !memcmp(field1->data, field2->data, len)));
+}
+
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data) /*!< in: data */
+{
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ return(len == dfield_get_len(field)
+ && (!len || len == UNIV_SQL_NULL
+ || !memcmp(dfield_get_data(field), data, len)));
+}
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ return(tuple->info_bits);
+}
+
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+{
+ tuple->info_bits = info_bits;
+}
+
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ return(tuple->n_fields_cmp);
+}
+
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+{
+ ut_ad(n_fields_cmp <= tuple->n_fields);
+ tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@param[in,out] buf buffer to use
+@param[in] buf_size buffer size
+@param[in] n_fields number of field
+@param[in] n_v_fields number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+ void* buf,
+ ulint buf_size,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ dtuple_t* tuple;
+ ulint n_t_fields = n_fields + n_v_fields;
+
+ ut_a(buf_size >= DTUPLE_EST_ALLOC(n_t_fields));
+
+ tuple = (dtuple_t*) buf;
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_v_fields = n_v_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) &tuple[1];
+ if (n_v_fields > 0) {
+ tuple->v_fields = &tuple->fields[n_fields];
+ } else {
+ tuple->v_fields = NULL;
+ }
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_t_fields; i++) {
+ dfield_t* field;
+
+ if (i >= n_fields) {
+ field = dtuple_get_nth_v_field(
+ tuple, i - n_fields);
+ } else {
+ field = dtuple_get_nth_field(tuple, i);
+ }
+
+ dfield_set_len(field, UNIV_SQL_NULL);
+ field->data = &data_error;
+ dfield_get_type(field)->mtype = DATA_ERROR;
+ dfield_get_type(field)->prtype = DATA_ERROR;
+ }
+ }
+#endif
+ MEM_CHECK_ADDRESSABLE(tuple->fields, n_t_fields
+ * sizeof *tuple->fields);
+ MEM_UNDEFINED(tuple->fields, n_t_fields * sizeof *tuple->fields);
+ return(tuple);
+}
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields
+@param[in,out] heap heap memory to use */
+UNIV_INLINE
+void
+dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_dup(dfield, heap);
+ }
+}
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields */
+UNIV_INLINE
+void
+dtuple_init_v_fld(dtuple_t* vrow)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_get_type(dfield)->mtype = DATA_MISSING;
+ dfield_set_len(dfield, UNIV_SQL_NULL);
+ }
+}
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields) /*!< in: number of fields */
+{
+ return(dtuple_create_with_vcol(heap, n_fields, 0));
+}
+
+/** Creates a data tuple with virtual columns to a memory heap.
+@param[in] heap memory heap where the tuple is created
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+ mem_heap_t* heap,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ void* buf;
+ ulint buf_size;
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ buf_size = DTUPLE_EST_ALLOC(n_fields + n_v_fields);
+ buf = mem_heap_alloc(heap, buf_size);
+
+ tuple = dtuple_create_from_mem(buf, buf_size, n_fields, n_v_fields);
+
+ return(tuple);
+}
+
+/** Copies a data tuple's virtual fields to another. This is a shallow copy;
+@param[in,out] d_tuple destination tuple
+@param[in] s_tuple source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(
+ dtuple_t* d_tuple,
+ const dtuple_t* s_tuple)
+{
+
+ ulint n_v_fields = dtuple_get_n_v_fields(d_tuple);
+ ut_ad(n_v_fields == dtuple_get_n_v_fields(s_tuple));
+
+ for (ulint i = 0; i < n_v_fields; i++) {
+ dfield_copy(dtuple_get_nth_v_field(d_tuple, i),
+ dtuple_get_nth_v_field(s_tuple, i));
+ }
+}
+
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+{
+ ulint n_fields = dtuple_get_n_fields(tuple);
+ ulint n_v_fields = dtuple_get_n_v_fields(tuple);
+ dtuple_t* new_tuple = dtuple_create_with_vcol(
+ heap, n_fields, n_v_fields);
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_copy(dtuple_get_nth_field(new_tuple, i),
+ dtuple_get_nth_field(tuple, i));
+ }
+
+ for (i = 0; i < n_v_fields; i++) {
+ dfield_copy(dtuple_get_nth_v_field(new_tuple, i),
+ dtuple_get_nth_v_field(tuple, i));
+ }
+
+ return(new_tuple);
+}
+
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted. Neither
+is possible space in externally stored parts of the field.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+ ulint sum = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = tuple->n_fields;
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (len == UNIV_SQL_NULL) {
+ len = dtype_get_sql_null_size(dfield_get_type(field),
+ comp);
+ }
+
+ sum += len;
+ }
+
+ return(sum);
+}
+
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_ext = 0;
+ ulint n_fields = tuple->n_fields;
+ ulint i;
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ for (i = 0; i < n_fields; i++) {
+ n_ext += dtuple_get_nth_field(tuple, i)->ext;
+ }
+
+ return(n_ext);
+}
+
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+{
+ dtype_t* dfield_type;
+ ulint i;
+
+ for (i = 0; i < n; i++) {
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dtype_set(dfield_type, DATA_BINARY, 0, 0);
+ }
+}
+
+/** Fold a prefix given as the number of fields of a tuple.
+@param[in] tuple index record
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in] index_id index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+ const dtuple_t* tuple,
+ ulint n_fields,
+ ulint n_bytes,
+ index_id_t tree_id)
+{
+ const dfield_t* field;
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_ull(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+{
+ memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+{
+ ulint n;
+ ulint i;
+
+ n = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ mem_heap_free(vector->heap);
+}
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
new file mode 100644
index 00000000..3d63ddb7
--- /dev/null
+++ b/storage/innobase/include/data0type.h
@@ -0,0 +1,591 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Special length indicating a missing instantly added column */
+#define UNIV_SQL_DEFAULT (UNIV_SQL_NULL - 1)
+
+/** @return whether a length is actually stored in a field */
+#define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT)
+
+extern ulint data_mysql_default_charset_coll;
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+struct dtype_t;
+
+/** SQL Like operator comparison types */
+enum ib_like_t {
+ IB_LIKE_EXACT, /**< e.g. STRING */
+ IB_LIKE_PREFIX /**< e.g., STRING% */
+};
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_MISSING 0 /* missing column */
+#define DATA_VARCHAR 1 /* character varying of the
+ latin1_swedish_ci charset-collation; note
+ that the MySQL format for this, DATA_BINARY,
+ DATA_VARMYSQL, is also affected by whether the
+ 'precise type' contains
+ DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB 5 /* binary large object, or a TEXT type;
+ if prtype & DATA_BINARY_TYPE == 0, then this is
+ actually a TEXT column (or a BLOB created
+ with < 4.0.14; since column prefix indexes
+ came only in 4.0.14, the missing flag in BLOBs
+ created before that does not cause any harm) */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
+
+/* DATA_GEOMETRY includes all standard geometry datatypes as described in
+OGC standard(point, line_string, polygon, multi_point, multi_polygon,
+multi_line_string, geometry_collection, geometry).
+Currently, geometry data is stored in the standard Well-Known Binary(WKB)
+format (http://www.opengeospatial.org/standards/sfa).
+We use BLOB as the underlying datatype. */
+#define DATA_GEOMETRY 14 /* geometry datatype of variable length */
+#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size()
+ requires the values are <= 63 */
+
+#define DATA_MTYPE_CURRENT_MIN DATA_VARCHAR /* minimum value of mtype */
+#define DATA_MTYPE_CURRENT_MAX DATA_GEOMETRY /* maximum value of mtype */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255U/* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define DATA_ROW_ID 0 /* row id: a 48-bit integer */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
+
+#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */
+
+#define DATA_SYS_PRTYPE_MASK 0xFU /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL 256U /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED 512U /* this id ORed to the precise type when
+ we have an unsigned integer type */
+#define DATA_BINARY_TYPE 1024U /* if the data type is a binary character
+ string, this is ORed to the precise type:
+ this only holds for tables created with
+ >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/
+#define DATA_GIS_MBR 2048U /* Used as GIS MBR column */
+/** the size of a GIS maximum bounding rectangle */
+constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double));
+
+#define DATA_LONG_TRUE_VARCHAR 4096U /* this is ORed to the precise data
+ type when the column is true VARCHAR where
+ MySQL uses 2 bytes to store the data len;
+ for shorter VARCHARs MySQL uses only 1 byte */
+#define DATA_VIRTUAL 8192U /* Virtual column */
+
+/** System Versioning */
+#define DATA_VERS_START 16384U /* start system field */
+#define DATA_VERS_END 32768U /* end system field */
+/** system-versioned user data column */
+#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END)
+
+/*-------------------------------------------*/
+
+/* This many bytes we need to store the type information affecting the
+alphabetical order for a single field and decide the storage size of an
+SQL null*/
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
+
+/* Maximum multi-byte character length in bytes, plus 1 */
+#define DATA_MBMAX 8
+
+/* For checking if mtype is GEOMETRY datatype */
+#define DATA_GEOMETRY_MTYPE(mtype) ((mtype) == DATA_GEOMETRY)
+
+/* For checking if mtype is BLOB or GEOMETRY, since we use BLOB as
+the underlying datatype of GEOMETRY data. */
+#define DATA_LARGE_MTYPE(mtype) ((mtype) == DATA_BLOB \
+ || (mtype) == DATA_GEOMETRY)
+
+/* For checking if data type is big length data type. */
+#define DATA_BIG_LEN_MTYPE(len, mtype) ((len) > 255 || DATA_LARGE_MTYPE(mtype))
+
+/* For checking if the column is a big length column. */
+#define DATA_BIG_COL(col) DATA_BIG_LEN_MTYPE((col)->len, (col)->mtype)
+
+/* For checking if data type is large binary data type. */
+#define DATA_LARGE_BINARY(mtype,prtype) ((mtype) == DATA_GEOMETRY || \
+ ((mtype) == DATA_BLOB && !((prtype) & DATA_BINARY_TYPE)))
+
+/* We now support 15 bits (up to 32767) collation number */
+#define MAX_CHAR_COLL_NUM 32767
+
+/* Mask to get the Charset Collation number (0x7fff) */
+#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type); /*!< in: type struct */
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of
+ a multi-byte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of
+ a multi-byte character, in bytes */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str); /*!< in: the string whose prefix
+ length is being determined */
+/** @return whether main type is a string type */
+inline bool dtype_is_string_type(ulint mtype)
+{
+ return mtype <= DATA_BLOB
+ || mtype == DATA_MYSQL || mtype == DATA_VARMYSQL;
+}
+
+/** @return whether a type is a binary string type */
+inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype)
+{
+ /* Note that for tables created before MySQL 4.0.14,
+ we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+ For those DATA_BLOB columns we return false. */
+
+ return mtype == DATA_FIXBINARY || mtype == DATA_BINARY
+ || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE));
+}
+
+/** @return whether a type is a non-binary string type */
+inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype)
+{
+ return dtype_is_string_type(mtype)
+ && !dtype_is_binary_string_type(mtype, prtype);
+}
+
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len); /*!< in: precision of type */
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2); /*!< in: type struct to copy from */
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type); /*!< in: data type */
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type); /*!< in: data type */
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ unsigned* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ unsigned* mbmaxlen); /*!< out: maximum length of a
+ multi-byte character */
+/**
+Get the charset-collation code for string types.
+@param prtype InnoDB precise type
+@return charset-collation code */
+inline uint16_t dtype_get_charset_coll(ulint prtype)
+{
+ return static_cast<uint16_t>(prtype >> 16) & CHAR_COLL_MASK;
+}
+
+/** Form a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@param[in] old_prtype MySQL type code and the flags
+ DATA_BINARY_TYPE etc.
+@param[in] charset_coll character-set collation code
+@return precise type, including the charset-collation code */
+UNIV_INLINE
+uint32_t
+dtype_form_prtype(ulint old_prtype, ulint charset_coll)
+{
+ ut_ad(old_prtype < 256 * 256);
+ ut_ad(charset_coll <= MAX_CHAR_COLL_NUM);
+ return(uint32_t(old_prtype + (charset_coll << 16)));
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type); /*!< in: data type */
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen); /*!< in: maximum length of a character */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len); /*!< in: length */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for the stored order info */
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len);/*!< in: prefix length to
+ replace type->len, or 0 */
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for stored type order info */
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type); /*!< in: type struct to validate */
+#ifdef UNIV_DEBUG
+/** Print a data type structure.
+@param[in] type data type */
+void
+dtype_print(
+ const dtype_t* type);
+#endif /* UNIV_DEBUG */
+
+struct dict_col_t;
+
+/* Structure for an SQL data type.
+If you add fields to this structure, be sure to initialize them everywhere.
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_t{
+ unsigned prtype:32; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+ unsigned mtype:8; /*!< main data type */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+ unsigned mbminlen:3; /*!< minimum length of a character,
+ in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a character,
+ in bytes */
+
+ /** @return whether this is system versioned user field */
+ bool is_versioned() const { return !(~prtype & DATA_VERSIONED); }
+ /** @return whether this is the system field start */
+ bool vers_sys_start() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_START;
+ }
+ /** @return whether this is the system field end */
+ bool vers_sys_end() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_END;
+ }
+
+ /** Set the type of the BLOB in the hidden metadata record. */
+ void metadata_blob_init()
+ {
+ prtype = DATA_NOT_NULL;
+ mtype = DATA_BLOB;
+ len = 0;
+ mbminlen = 0;
+ mbmaxlen = 0;
+ }
+
+ /** Copy the type information from a column.
+ @param col column type to be copied */
+ void assign(const dict_col_t &col);
+};
+
+/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
+extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+/** Info bit denoting the predefined minimum record: this bit is set
+if and only if the record is the first user record on a non-leaf
+B-tree page that is the leftmost page on its level
+(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */
+#define REC_INFO_MIN_REC_FLAG 0x10UL
+/** The delete-mark flag in info bits */
+#define REC_INFO_DELETED_FLAG 0x20UL
+
+/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */
+enum rec_comp_status_t {
+ /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */
+ REC_STATUS_ORDINARY = 0,
+ /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */
+ REC_STATUS_NODE_PTR = 1,
+ /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */
+ REC_STATUS_INFIMUM = 2,
+ /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */
+ REC_STATUS_SUPREMUM = 3,
+ /** Clustered index record that has been inserted or updated
+ after instant ADD COLUMN (more than dict_index_t::n_core_fields) */
+ REC_STATUS_INSTANT = 4
+};
+
+/** The dtuple_t::info_bits of the hidden metadata of instant ADD COLUMN.
+@see rec_is_metadata()
+@see rec_is_alter_metadata() */
+static const byte REC_INFO_METADATA_ADD
+ = REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT;
+
+/** The dtuple_t::info_bits of the hidden metadata of instant ALTER TABLE.
+@see rec_is_metadata() */
+static const byte REC_INFO_METADATA_ALTER
+ = REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG;
+
+#include "data0type.inl"
diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl
new file mode 100644
index 00000000..329cee5d
--- /dev/null
+++ b/storage/innobase/include/data0type.inl
@@ -0,0 +1,487 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.ic
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ha_prototypes.h"
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype) /*!< in: precise data type */
+{
+ /* These codes have been copied from strings/ctype-extra.c
+ and strings/ctype-utf8.c. */
+ switch (dtype_get_charset_coll(prtype)) {
+ case 11: /* ascii_general_ci */
+ case 65: /* ascii_bin */
+ case 33: /* utf8_general_ci */
+ case 83: /* utf8_bin */
+ case 254: /* utf8_general_cs */
+ return true;
+ }
+
+ return false;
+}
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type) /*!< in: type struct */
+{
+ return(type->prtype & 0xFFUL);
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_set_mblen(
+/*============*/
+ dtype_t* type) /*!< in/out: type */
+{
+ unsigned mbminlen, mbmaxlen;
+
+ dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen);
+ type->mbminlen = mbminlen & 7;
+ type->mbmaxlen = mbmaxlen & 7;
+
+ ut_ad(dtype_validate(type));
+}
+
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = static_cast<byte>(mtype);
+ type->prtype = static_cast<unsigned>(prtype);
+ type->len = static_cast<uint16_t>(len);
+
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2) /*!< in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbmaxlen;
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len)/*!< in: prefix length to
+ replace type->len, or 0 */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ ulint len;
+
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MTYPE_MAX);
+
+ buf[0] = (byte)(type->mtype & 0xFFUL);
+
+ if (type->prtype & DATA_BINARY_TYPE) {
+ buf[0] |= 128;
+ }
+
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] |= 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
+
+ len = prefix_len ? prefix_len : type->len;
+
+ mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+ ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ if (type->prtype & DATA_NOT_NULL) {
+ buf[4] |= 128;
+ }
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+ dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+ dtype_set_mblen(type);
+}
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else if (!comp) {
+ return static_cast<unsigned>(len);
+ } else {
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+ }
+ /* Treat as variable-length. */
+ /* fall through */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen) /*!< in: maximum length of a character */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else {
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+
+ /* this is a variable-length character set */
+ ut_a(mbminlen > 0);
+ ut_a(mbmaxlen > mbminlen);
+ ut_a(len % mbmaxlen == 0);
+ return static_cast<unsigned>(
+ len * mbminlen / mbmaxlen);
+ }
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len) /*!< in: length */
+{
+ switch (mtype) {
+ case DATA_SYS:
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_MYSQL:
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ return(len);
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ULINT_MAX);
+}
+
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ type->mbminlen, type->mbmaxlen, comp));
+}
diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h
new file mode 100644
index 00000000..bcd6b8bc
--- /dev/null
+++ b/storage/innobase/include/data0types.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0types.h
+Some type definitions
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+struct dfield_t;
+
+/* SQL data tuple struct */
+struct dtuple_t;
+
+#endif
+
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
new file mode 100644
index 00000000..64182aab
--- /dev/null
+++ b/storage/innobase/include/db0err.h
@@ -0,0 +1,170 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+enum dberr_t {
+ DB_SUCCESS,
+
+ DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ explicit record lock was created */
+
+ /* The following are error codes */
+ DB_ERROR = 11,
+ DB_INTERRUPTED,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_MISSING_HISTORY, /*!< required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /*!< referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /*!< data structure corruption
+ noticed */
+ DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /*!< no savepoint exists with the given
+ name */
+ DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is
+ being dropped right now */
+ DB_TABLESPACE_NOT_FOUND, /*<! Attempt to delete a tablespace
+ instance that was not found in the
+ tablespace hash table */
+ DB_LOCK_TABLE_FULL, /*!< lock structs have exhausted the
+ buffer pool (for big transactions,
+ InnoDB stores the lock structs in the
+ buffer pool) */
+ DB_FOREIGN_DUPLICATE_KEY, /*!< foreign key constraints
+ activated by the operation would
+ lead to a duplicate key in some
+ table */
+ DB_TOO_MANY_CONCURRENT_TRXS, /*!< when InnoDB runs out of the
+ preconfigured undo slots, this can
+ only happen when there are too many
+ concurrent transactions */
+ DB_UNSUPPORTED, /*!< when InnoDB sees any artefact or
+ a feature that it can't recoginize or
+ work with e.g., FT indexes created by
+ a later version of the engine. */
+
+ DB_INVALID_NULL, /*!< a NOT NULL column was found to
+ be NULL during table rebuild */
+
+ DB_STATS_DO_NOT_EXIST, /*!< an operation that requires the
+ persistent storage, used for recording
+ table and index statistics, was
+ requested but this storage does not
+ exist itself or the stats for a given
+ table do not exist */
+ DB_FOREIGN_EXCEED_MAX_CASCADE, /*!< Foreign key constraint related
+ cascading delete/update exceeds
+ maximum allowed depth */
+ DB_CHILD_NO_INDEX, /*!< the child (foreign) table does
+ not have an index that contains the
+ foreign keys as its prefix columns */
+ DB_PARENT_NO_INDEX, /*!< the parent table does not
+ have an index that contains the
+ foreign keys as its prefix columns */
+ DB_TOO_BIG_INDEX_COL, /*!< index column size exceeds
+ maximum limit */
+ DB_INDEX_CORRUPT, /*!< we have corrupted index */
+ DB_UNDO_RECORD_TOO_BIG, /*!< the undo log record is too big */
+ DB_READ_ONLY, /*!< Update operation attempted in
+ a read-only transaction */
+ DB_FTS_INVALID_DOCID, /* FTS Doc ID cannot be zero */
+ DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big
+ during online index creation */
+
+ DB_IDENTIFIER_TOO_LONG, /*!< Identifier name too long */
+ DB_FTS_EXCEED_RESULT_CACHE_LIMIT, /*!< FTS query memory
+ exceeds result cache limit */
+ DB_TEMP_FILE_WRITE_FAIL, /*!< Temp file write failure */
+ DB_CANT_CREATE_GEOMETRY_OBJECT, /*!< Cannot create specified Geometry
+ data object */
+ DB_CANNOT_OPEN_FILE, /*!< Cannot open a file */
+ DB_FTS_TOO_MANY_WORDS_IN_PHRASE,
+ /*< Too many words in a phrase */
+
+ DB_DECRYPTION_FAILED, /* Tablespace encrypted and
+ decrypt operation failed because
+ of missing key management plugin,
+ or missing or incorrect key or
+ incorret AES method or algorithm. */
+
+ DB_IO_ERROR = 100, /*!< Generic IO error */
+
+ DB_IO_PARTIAL_FAILED, /*!< Partial IO request failed */
+
+ DB_TABLE_CORRUPT, /*!< Table/clustered index is
+ corrupted */
+
+ DB_COMPUTE_VALUE_FAILED, /*!< Compute generated value failed */
+
+ DB_NO_FK_ON_S_BASE_COL, /*!< Cannot add foreign constrain
+ placed on the base column of
+ stored column */
+
+ DB_IO_NO_PUNCH_HOLE, /*!< Punch hole not supported by
+ file system. */
+
+ DB_PAGE_CORRUPTED, /* Page read from tablespace is
+ corrupted. */
+ /* The following are partial failure codes */
+ DB_FAIL = 1000,
+ DB_OVERFLOW,
+ DB_UNDERFLOW,
+ DB_STRONG_FAIL,
+ DB_ZIP_OVERFLOW,
+ DB_RECORD_NOT_FOUND = 1500,
+ DB_END_OF_INDEX,
+ DB_NOT_FOUND, /*!< Generic error code for "Not found"
+ type of errors */
+};
+
+#endif
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
new file mode 100644
index 00000000..a6528747
--- /dev/null
+++ b/storage/innobase/include/dict0boot.h
@@ -0,0 +1,297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.h
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+
+/**********************************************************************//**
+Returns a new table, index, or space id. */
+void
+dict_hdr_get_new_id(
+/*================*/
+ table_id_t* table_id, /*!< out: table id
+ (not assigned if NULL) */
+ index_id_t* index_id, /*!< out: index id
+ (not assigned if NULL) */
+ uint32_t* space_id); /*!< out: space id
+ (not assigned if NULL) */
+/** Update dict_sys.row_id in the dictionary header file page. */
+void dict_hdr_flush_row_id(row_id_t id);
+/** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+inline row_id_t dict_sys_t::get_new_row_id()
+{
+ row_id_t id= row_id.fetch_add(1);
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ return id;
+}
+
+/** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+inline void dict_sys_t::update_row_id(row_id_t id)
+{
+ row_id_t sys_id= row_id;
+ while (id >= sys_id)
+ {
+ if (!row_id.compare_exchange_strong(sys_id, id))
+ continue;
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ break;
+ }
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+inline void dict_sys_write_row_id(byte *field, row_id_t row_id)
+{
+ static_assert(DATA_ROW_ID_LEN == 6, "compatibility");
+ mach_write_to_6(field, row_id);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_boot(void)
+/*===========*/
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_create(void)
+/*=============*/
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a table id belongs to system table.
+@return true if the table id belongs to a system table. */
+inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; }
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID 1
+#define DICT_COLUMNS_ID 2
+#define DICT_INDEXES_ID dict_index_t::DICT_INDEXES_ID /* 3 */
+#define DICT_FIELDS_ID 4
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID 5
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id,or 0*/
+#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete,always DICT_HDR_FIRST_ID*/
+#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
+
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+ DICT_COL__SYS_TABLES__NAME = 0,
+ DICT_COL__SYS_TABLES__ID = 1,
+ DICT_COL__SYS_TABLES__N_COLS = 2,
+ DICT_COL__SYS_TABLES__TYPE = 3,
+ DICT_COL__SYS_TABLES__MIX_ID = 4,
+ DICT_COL__SYS_TABLES__MIX_LEN = 5,
+ DICT_COL__SYS_TABLES__CLUSTER_ID = 6,
+ DICT_COL__SYS_TABLES__SPACE = 7,
+ DICT_NUM_COLS__SYS_TABLES = 8
+};
+/* The field numbers in the SYS_TABLES clustered index */
+enum dict_fld_sys_tables_enum {
+ DICT_FLD__SYS_TABLES__NAME = 0,
+ DICT_FLD__SYS_TABLES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_TABLES__ID = 3,
+ DICT_FLD__SYS_TABLES__N_COLS = 4,
+ DICT_FLD__SYS_TABLES__TYPE = 5,
+ DICT_FLD__SYS_TABLES__MIX_ID = 6,
+ DICT_FLD__SYS_TABLES__MIX_LEN = 7,
+ DICT_FLD__SYS_TABLES__CLUSTER_ID = 8,
+ DICT_FLD__SYS_TABLES__SPACE = 9,
+ DICT_NUM_FIELDS__SYS_TABLES = 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+ DICT_FLD__SYS_TABLE_IDS__ID = 0,
+ DICT_FLD__SYS_TABLE_IDS__NAME = 1,
+ DICT_NUM_FIELDS__SYS_TABLE_IDS = 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+ DICT_COL__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_COL__SYS_COLUMNS__POS = 1,
+ DICT_COL__SYS_COLUMNS__NAME = 2,
+ DICT_COL__SYS_COLUMNS__MTYPE = 3,
+ DICT_COL__SYS_COLUMNS__PRTYPE = 4,
+ DICT_COL__SYS_COLUMNS__LEN = 5,
+ DICT_COL__SYS_COLUMNS__PREC = 6,
+ DICT_NUM_COLS__SYS_COLUMNS = 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+ DICT_FLD__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_FLD__SYS_COLUMNS__POS = 1,
+ DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_COLUMNS__NAME = 4,
+ DICT_FLD__SYS_COLUMNS__MTYPE = 5,
+ DICT_FLD__SYS_COLUMNS__PRTYPE = 6,
+ DICT_FLD__SYS_COLUMNS__LEN = 7,
+ DICT_FLD__SYS_COLUMNS__PREC = 8,
+ DICT_NUM_FIELDS__SYS_COLUMNS = 9
+};
+/* The columns in SYS_INDEXES */
+enum dict_col_sys_indexes_enum {
+ DICT_COL__SYS_INDEXES__TABLE_ID = 0,
+ DICT_COL__SYS_INDEXES__ID = 1,
+ DICT_COL__SYS_INDEXES__NAME = 2,
+ DICT_COL__SYS_INDEXES__N_FIELDS = 3,
+ DICT_COL__SYS_INDEXES__TYPE = 4,
+ DICT_COL__SYS_INDEXES__SPACE = 5,
+ DICT_COL__SYS_INDEXES__PAGE_NO = 6,
+ DICT_COL__SYS_INDEXES__MERGE_THRESHOLD = 7,
+ DICT_NUM_COLS__SYS_INDEXES = 8
+};
+/* The field numbers in the SYS_INDEXES clustered index */
+enum dict_fld_sys_indexes_enum {
+ DICT_FLD__SYS_INDEXES__TABLE_ID = 0,
+ DICT_FLD__SYS_INDEXES__ID = 1,
+ DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2,
+ DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_INDEXES__NAME = 4,
+ DICT_FLD__SYS_INDEXES__N_FIELDS = 5,
+ DICT_FLD__SYS_INDEXES__TYPE = 6,
+ DICT_FLD__SYS_INDEXES__SPACE = 7,
+ DICT_FLD__SYS_INDEXES__PAGE_NO = 8,
+ DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD = 9,
+ DICT_NUM_FIELDS__SYS_INDEXES = 10
+};
+/* The columns in SYS_FIELDS */
+enum dict_col_sys_fields_enum {
+ DICT_COL__SYS_FIELDS__INDEX_ID = 0,
+ DICT_COL__SYS_FIELDS__POS = 1,
+ DICT_COL__SYS_FIELDS__COL_NAME = 2,
+ DICT_NUM_COLS__SYS_FIELDS = 3
+};
+/* The field numbers in the SYS_FIELDS clustered index */
+enum dict_fld_sys_fields_enum {
+ DICT_FLD__SYS_FIELDS__INDEX_ID = 0,
+ DICT_FLD__SYS_FIELDS__POS = 1,
+ DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_FIELDS__COL_NAME = 4,
+ DICT_NUM_FIELDS__SYS_FIELDS = 5
+};
+/* The columns in SYS_FOREIGN */
+enum dict_col_sys_foreign_enum {
+ DICT_COL__SYS_FOREIGN__ID = 0,
+ DICT_COL__SYS_FOREIGN__FOR_NAME = 1,
+ DICT_COL__SYS_FOREIGN__REF_NAME = 2,
+ DICT_COL__SYS_FOREIGN__N_COLS = 3,
+ DICT_NUM_COLS__SYS_FOREIGN = 4
+};
+/* The field numbers in the SYS_FOREIGN clustered index */
+enum dict_fld_sys_foreign_enum {
+ DICT_FLD__SYS_FOREIGN__ID = 0,
+ DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1,
+ DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_FOREIGN__FOR_NAME = 3,
+ DICT_FLD__SYS_FOREIGN__REF_NAME = 4,
+ DICT_FLD__SYS_FOREIGN__N_COLS = 5,
+ DICT_NUM_FIELDS__SYS_FOREIGN = 6
+};
+/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */
+enum dict_fld_sys_foreign_for_name_enum {
+ DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0,
+ DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1,
+ DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2
+};
+/* The columns in SYS_FOREIGN_COLS */
+enum dict_col_sys_foreign_cols_enum {
+ DICT_COL__SYS_FOREIGN_COLS__ID = 0,
+ DICT_COL__SYS_FOREIGN_COLS__POS = 1,
+ DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2,
+ DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3,
+ DICT_NUM_COLS__SYS_FOREIGN_COLS = 4
+};
+/* The field numbers in the SYS_FOREIGN_COLS clustered index */
+enum dict_fld_sys_foreign_cols_enum {
+ DICT_FLD__SYS_FOREIGN_COLS__ID = 0,
+ DICT_FLD__SYS_FOREIGN_COLS__POS = 1,
+ DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4,
+ DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5,
+ DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6
+};
+/* The columns in SYS_VIRTUAL */
+enum dict_col_sys_virtual_enum {
+ DICT_COL__SYS_VIRTUAL__TABLE_ID = 0,
+ DICT_COL__SYS_VIRTUAL__POS = 1,
+ DICT_COL__SYS_VIRTUAL__BASE_POS = 2,
+ DICT_NUM_COLS__SYS_VIRTUAL = 3
+};
+/* The field numbers in the SYS_VIRTUAL clustered index */
+enum dict_fld_sys_virtual_enum {
+ DICT_FLD__SYS_VIRTUAL__TABLE_ID = 0,
+ DICT_FLD__SYS_VIRTUAL__POS = 1,
+ DICT_FLD__SYS_VIRTUAL__BASE_POS = 2,
+ DICT_FLD__SYS_VIRTUAL__DB_TRX_ID = 3,
+ DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR = 4,
+ DICT_NUM_FIELDS__SYS_VIRTUAL = 5
+};
+
+/* A number of the columns above occur in multiple tables. These are the
+length of thos fields. */
+#define DICT_FLD_LEN_SPACE 4
+#define DICT_FLD_LEN_FLAGS 4
+
+#endif
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
new file mode 100644
index 00000000..c40df12b
--- /dev/null
+++ b/storage/innobase/include/dict0crea.h
@@ -0,0 +1,277 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.h
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "fil0crypt.h"
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as
+ a memory data structure */
+ mem_heap_t* heap); /*!< in: heap where created */
+
+/** Creates an index create graph.
+@param[in] index index to create, built as a memory data structure
+@param[in] table table name
+@param[in,out] heap heap where created
+@param[in] mode encryption mode (for creating a table)
+@param[in] key_id encryption key identifier (for creating a table)
+@param[in] add_v new virtual columns added in the same clause with
+ add index
+@return own: index create node */
+ind_node_t*
+ind_create_graph_create(
+ dict_index_t* index,
+ const char* table,
+ mem_heap_t* heap,
+ fil_encryption_t mode,
+ uint32_t key_id,
+ const dict_add_v_col_t* add_v = NULL);
+
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/***************************************************************//**
+Builds an index definition but doesn't update sys_table.
+@return DB_SUCCESS or error code */
+void
+dict_build_index_def(
+/*=================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in/out: index */
+ trx_t* trx); /*!< in/out: InnoDB transaction
+ handle */
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+Don't update SYSTEM TABLES.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+dict_create_index_tree(
+/*===================*/
+ dict_index_t* index, /*!< in/out: index */
+ const trx_t* trx); /*!< in: InnoDB transaction handle */
+
+/** Drop the index tree associated with a row in SYS_INDEXES table.
+@param[in,out] pcur persistent cursor on rec
+@param[in,out] trx dictionary transaction
+@param[in,out] mtr mini-transaction
+@return tablespace ID to drop (if this is the clustered index)
+@retval 0 if no tablespace is to be dropped */
+uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+Don't update SYSTEM TABLES.
+@return error code */
+dberr_t
+dict_create_index_tree_in_mem(
+/*==========================*/
+ dict_index_t* index, /*!< in/out: index */
+ const trx_t* trx); /*!< in: InnoDB transaction handle */
+
+/********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18. */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+ ulint* id_nr, /*!< in/out: number to use in id
+ generation; incremented if used */
+ const char* name, /*!< in: table name */
+ dict_foreign_t* foreign); /*!< in/out: foreign key */
+
+/** Adds the given set of foreign key objects to the dictionary tables
+in the database. This function does not modify the dictionary cache. The
+caller must ensure that all foreign key objects contain a valid constraint
+name in foreign->id.
+@param[in] local_fk_set set of foreign key objects, to be added to
+the dictionary tables
+@param[in] table table to which the foreign key objects in
+local_fk_set belong to
+@param[in,out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table,
+ trx_t* trx)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if a foreign constraint is on columns server as base columns
+of any stored column. This is to prevent creating SET NULL or CASCADE
+constraint on such columns
+@param[in] local_fk_set set of foreign key objects, to be added to
+the dictionary tables
+@param[in] table table to which the foreign key objects in
+local_fk_set belong to
+@return true if yes, otherwise, false */
+bool
+dict_foreigns_has_s_base_col(
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table);
+
+/********************************************************************//**
+Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ const char* name, /*!< in: table name */
+ const dict_foreign_t* foreign,/*!< in: foreign key */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/* Table create node structure */
+struct tab_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */
+ dict_table_t* table; /*!< table to create, built as a
+ memory data structure with
+ dict_mem_... functions */
+ ins_node_t* tab_def; /*!< child node which does the insert of
+ the table definition; the row to be
+ inserted is built by the parent node */
+ ins_node_t* col_def; /*!< child node which does the inserts
+ of the column definitions; the row to
+ be inserted is built by the parent
+ node */
+ ins_node_t* v_col_def; /*!< child node which does the inserts
+ of the sys_virtual row definitions;
+ the row to be inserted is built by
+ the parent node */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ ulint col_no; /*!< next column definition to insert */
+ ulint base_col_no; /*!< next base column to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary
+ storage */
+};
+
+/* Table create node states */
+#define TABLE_BUILD_TABLE_DEF 1
+#define TABLE_BUILD_COL_DEF 2
+#define TABLE_BUILD_V_COL_DEF 3
+#define TABLE_ADD_TO_CACHE 4
+#define TABLE_COMPLETED 5
+
+/* Index create node struct */
+
+struct ind_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */
+ dict_index_t* index; /*!< index to create, built as a
+ memory data structure with
+ dict_mem_... functions */
+ const char* table_name; /*!< table name */
+ ins_node_t* ind_def; /*!< child node which does the insert of
+ the index definition; the row to be
+ inserted is built by the parent node */
+ ins_node_t* field_def; /*!< child node which does the inserts
+ of the field definitions; the row to
+ be inserted is built by the parent
+ node */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ uint32_t page_no; /* root page number of the index */
+ dtuple_t* ind_row; /* index definition row built */
+ ulint field_no; /* next field definition to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary
+ storage */
+ uint key_id; /*!< encryption key_id */
+ fil_encryption_t mode; /*!< encryption mode */
+ const dict_add_v_col_t*
+ add_v; /*!< new virtual columns that being
+ added along with an add index call */
+};
+
+/** Compose a column number for a virtual column, stored in the "POS" field
+of Sys_columns. The column number includes both its virtual column sequence
+(the "nth" virtual column) and its actual column position in original table
+@param[in] v_pos virtual column sequence
+@param[in] col_pos column position in original table definition
+@return composed column position number */
+UNIV_INLINE
+ulint
+dict_create_v_col_pos(
+ ulint v_pos,
+ ulint col_pos);
+
+/** Get the column number for a virtual column (the column position in
+original table), stored in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return column position in original table */
+UNIV_INLINE
+ulint
+dict_get_v_col_mysql_pos(
+ ulint pos);
+
+/** Get a virtual column sequence (the "nth" virtual column) for a
+virtual column, stord in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return virtual column sequence */
+UNIV_INLINE
+ulint
+dict_get_v_col_pos(
+ ulint pos);
+
+/* Index create node states */
+#define INDEX_BUILD_INDEX_DEF 1
+#define INDEX_BUILD_FIELD_DEF 2
+#define INDEX_CREATE_INDEX_TREE 3
+#define INDEX_ADD_TO_CACHE 4
+
+#include "dict0crea.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0crea.inl b/storage/innobase/include/dict0crea.inl
new file mode 100644
index 00000000..5641206d
--- /dev/null
+++ b/storage/innobase/include/dict0crea.inl
@@ -0,0 +1,136 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.ic
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "ha_prototypes.h"
+
+#include "mem0mem.h"
+
+/********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18. */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+ ulint* id_nr, /*!< in/out: number to use in id generation;
+ incremented if used */
+ const char* name, /*!< in: table name */
+ dict_foreign_t* foreign)/*!< in/out: foreign key */
+{
+ DBUG_ENTER("dict_create_add_foreign_id");
+
+ if (foreign->id == NULL) {
+ /* Generate a new constraint id */
+ ulint namelen = strlen(name);
+ char* id = static_cast<char*>(
+ mem_heap_alloc(foreign->heap,
+ namelen + 20));
+
+ if (dict_table_t::is_temporary_name(name)) {
+
+ /* no overflow if number < 1e13 */
+ sprintf(id, "%s_ibfk_%lu", name,
+ (ulong) (*id_nr)++);
+ } else {
+ char table_name[MAX_TABLE_NAME_LEN + 21];
+ uint errors = 0;
+
+ strncpy(table_name, name, (sizeof table_name) - 1);
+ table_name[(sizeof table_name) - 1] = '\0';
+
+ innobase_convert_to_system_charset(
+ strchr(table_name, '/') + 1,
+ strchr(name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ strncpy(table_name, name,
+ (sizeof table_name) - 1);
+ table_name[(sizeof table_name) - 1] = '\0';
+ }
+
+ /* no overflow if number < 1e13 */
+ sprintf(id, "%s_ibfk_%lu", table_name,
+ (ulong) (*id_nr)++);
+
+ if (innobase_check_identifier_length(
+ strchr(id,'/') + 1)) {
+ DBUG_RETURN(DB_IDENTIFIER_TOO_LONG);
+ }
+ }
+ foreign->id = id;
+
+ DBUG_PRINT("dict_create_add_foreign_id",
+ ("generated foreign id: %s", id));
+ }
+
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/** Compose a column number for a virtual column, stored in the "POS" field
+of Sys_columns. The column number includes both its virtual column sequence
+(the "nth" virtual column) and its actual column position in original table
+@param[in] v_pos virtual column sequence
+@param[in] col_pos column position in original table definition
+@return composed column position number */
+UNIV_INLINE
+ulint
+dict_create_v_col_pos(
+ ulint v_pos,
+ ulint col_pos)
+{
+ ut_ad(v_pos <= REC_MAX_N_FIELDS);
+ ut_ad(col_pos <= REC_MAX_N_FIELDS);
+
+ return(((v_pos + 1) << 16) + col_pos);
+}
+
+/** Get the column number for a virtual column (the column position in
+original table), stored in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return column position in original table */
+UNIV_INLINE
+ulint
+dict_get_v_col_mysql_pos(
+ ulint pos)
+{
+ return(pos & 0xFFFF);
+}
+
+/** Get a virtual column sequence (the "nth" virtual column) for a
+virtual column, stord in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return virtual column sequence */
+UNIV_INLINE
+ulint
+dict_get_v_col_pos(
+ ulint pos)
+{
+ return((pos >> 16) - 1);
+}
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h
new file mode 100644
index 00000000..679484ad
--- /dev/null
+++ b/storage/innobase/include/dict0defrag_bg.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0defrag_bg.h
+Code used for background table and index
+defragmentation
+
+Created 25/08/2016 Jan Lindström
+*******************************************************/
+
+#ifndef dict0defrag_bg_h
+#define dict0defrag_bg_h
+
+#include "dict0types.h"
+
+/** Indices whose defrag stats need to be saved to persistent storage.*/
+struct defrag_pool_item_t {
+ table_id_t table_id;
+ index_id_t index_id;
+};
+
+/** Allocator type, used by std::vector */
+typedef ut_allocator<defrag_pool_item_t>
+ defrag_pool_allocator_t;
+
+/** The multitude of tables to be defragmented- an STL vector */
+typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t>
+ defrag_pool_t;
+
+/** Pool where we store information on which tables are to be processed
+by background defragmentation. */
+extern defrag_pool_t defrag_pool;
+
+/*****************************************************************//**
+Initialize the defrag pool, called once during thread initialization. */
+void
+dict_defrag_pool_init(void);
+/*========================*/
+
+/*****************************************************************//**
+Free the resources occupied by the defrag pool, called once during
+thread de-initialization. */
+void
+dict_defrag_pool_deinit(void);
+/*==========================*/
+
+/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+ const dict_index_t* index); /*!< in: table to add */
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!<in: if given, remove
+ all entries for the table */
+ const dict_index_t* index); /*!< in: index to remove */
+
+/**
+Get the first index that has been added for updating persistent defrag
+stats and eventually save its stats. */
+void dict_defrag_process_entries_from_defrag_pool(THD *thd);
+
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_defrag_stats(
+/*============================*/
+ dict_index_t* index); /*!< in: index */
+#endif /* dict0defrag_bg_h */
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
new file mode 100644
index 00000000..5fafb2c5
--- /dev/null
+++ b/storage/innobase/include/dict0dict.h
@@ -0,0 +1,1744 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "data0data.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+#include "srw_lock.h"
+#include <my_sys.h>
+#include <deque>
+
+class MDL_ticket;
+
+/** the first table or index ID for other than hard-coded system tables */
+constexpr uint8_t DICT_HDR_FIRST_ID= 10;
+
+
+/** Get the database name length in a table name.
+@param name filename-safe encoded table name "dbname/tablename"
+@return database name length */
+inline size_t dict_get_db_name_len(const char *name)
+{
+ /* table_name_t::dblen() would assert that '/' is contained */
+ if (const char* s= strchr(name, '/'))
+ return size_t(s - name);
+
+ return 0;
+}
+
+
+/*********************************************************************//**
+Open a table from its database and table name, this is currently used by
+foreign constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from
+heap memory passed in */
+char*
+dict_get_referenced_table(
+/*======================*/
+ const char* name, /*!< in: foreign key table name */
+ const char* database_name, /*!< in: table db name */
+ ulint database_name_len,/*!< in: db name length */
+ const char* table_name, /*!< in: table name */
+ ulint table_name_len, /*!< in: table name length */
+ dict_table_t** table, /*!< out: table object or NULL */
+ mem_heap_t* heap, /*!< in: heap memory */
+ CHARSET_INFO* from_cs); /*!< in: table name charset */
+/*********************************************************************//**
+Frees a foreign key struct. */
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign); /*!< in, own: foreign key struct */
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+ dict_table_t* table); /*!< in: table in the dictionary
+ memory cache */
+/** Check whether the dict_table_t is a partition.
+A partitioned table on the SQL level is composed of InnoDB tables,
+where each InnoDB table is a [sub]partition including its secondary indexes
+which belongs to the partition.
+@param[in] table Table to check.
+@return true if the dict_table_t is a partition else false. */
+UNIV_INLINE
+bool
+dict_table_is_partition(const dict_table_t* table)
+{
+ /* Check both P and p on all platforms in case it was moved to/from
+ WIN. */
+ return (strstr(table->name.m_name, "#p#")
+ || strstr(table->name.m_name, "#P#"));
+}
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Operation to perform when opening a table */
+enum dict_table_op_t {
+ /** Expect the tablespace to exist. */
+ DICT_TABLE_OP_NORMAL = 0,
+ /** Drop any orphan indexes after an aborted online index creation */
+ DICT_TABLE_OP_DROP_ORPHAN,
+ /** Silently load the tablespace if it does not exist,
+ and do not load the definitions of incomplete indexes. */
+ DICT_TABLE_OP_LOAD_TABLESPACE,
+ /** Open the table only if it's in table cache. */
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
+};
+
+/** Acquire MDL shared for the table name.
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] thd background thread
+@param[out] mdl mdl ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval NULL if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+ THD *thd,
+ MDL_ticket **mdl,
+ dict_table_op_t table_op= DICT_TABLE_OP_NORMAL);
+
+/** Look up a table by numeric identifier.
+@param[in] table_id table identifier
+@param[in] dict_locked data dictionary locked
+@param[in] table_op operation to perform when opening
+@param[in,out] thd background thread, or NULL to not acquire MDL
+@param[out] mdl mdl ticket, or NULL
+@return table, NULL if does not exist */
+dict_table_t*
+dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+ dict_table_op_t table_op, THD *thd= nullptr,
+ MDL_ticket **mdl= nullptr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Decrement the count of open handles */
+void dict_table_close(dict_table_t *table);
+
+/** Decrements the count of open handles of a table.
+@param[in,out] table table
+@param[in] dict_locked whether dict_sys.latch is being held
+@param[in] thd thread to release MDL
+@param[in] mdl metadata lock or NULL if the thread is a
+ foreground one. */
+void
+dict_table_close(
+ dict_table_t* table,
+ bool dict_locked,
+ THD* thd = NULL,
+ MDL_ticket* mdl = NULL);
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type); /*!< out: data type */
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if !dict_table_has_atomic_blobs(table), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine maximum bytes of a virtual column need to be stored
+in the undo log.
+@param[in] table dict_table_t for the table
+@param[in] col_no virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+ dict_table_t* table,
+ ulint col_no);
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Gets the column position in the given index.
+@param[in] col table column
+@param[in] index index to be searched for column
+@return position of column in the given index. */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+ const dict_col_t* col,
+ const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name) /*!< in: column name */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Unconditionally set the AUTO_INCREMENT counter.
+@param[in,out] table table or partition
+@param[in] value next available AUTO_INCREMENT value */
+MY_ATTRIBUTE((nonnull))
+UNIV_INLINE
+void
+dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value)
+{
+ table->autoinc = value;
+}
+
+/**
+@param[in] table table or partition
+@return the next AUTO_INCREMENT counter value
+@retval 0 if AUTO_INCREMENT is not yet initialized */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+UNIV_INLINE
+ib_uint64_t
+dict_table_autoinc_read(const dict_table_t* table)
+{
+ return(table->autoinc);
+}
+
+/** Update the AUTO_INCREMENT sequence if the value supplied is greater
+than the current value.
+@param[in,out] table table or partition
+@param[in] value AUTO_INCREMENT value that was assigned to a row
+@return whether the AUTO_INCREMENT sequence was updated */
+MY_ATTRIBUTE((nonnull))
+UNIV_INLINE
+bool
+dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value)
+{
+ if (value > table->autoinc) {
+
+ table->autoinc = value;
+ return(true);
+ }
+
+ return(false);
+}
+
+/**********************************************************************//**
+Adds system columns to a table object. */
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in: temporary heap */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Renames a table object.
+@return TRUE if success */
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ span<const char> new_name, /*!< in: new name */
+ bool replace_new_file)
+ /*!< in: whether to replace the
+ file with the new name
+ (as part of rolling back TRUNCATE) */
+ MY_ATTRIBUTE((nonnull));
+
+/** Removes an index from the dictionary cache.
+@param[in,out] table table whose index to remove
+@param[in,out] index index to remove, this object is destroyed and must not
+be accessed by the caller afterwards */
+void
+dict_index_remove_from_cache(
+ dict_table_t* table,
+ dict_index_t* index);
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ table_id_t new_id) /*!< in: new id to set */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+ dict_foreign_t* foreign) /*!< in, own: foreign constraint */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in.
+At least one of foreign table or referenced table must already be in
+the dictionary cache!
+@return DB_SUCCESS or error code */
+dberr_t
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign,
+ /*!< in, own: foreign key constraint */
+ const char** col_names,
+ /*!< in: column names, or NULL to use
+ foreign->foreign_table->col_names */
+ bool check_charsets,
+ /*!< in: whether to check charset
+ compatibility */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
+bool
+dict_foreign_replace_index(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const dict_index_t* index) /*!< in: index to be replaced */
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+dberr_t
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop) /*!< out: id's of the
+ constraints to drop */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@param[in] table_name Table name
+@param[in] dict_locked whether dict_sys.latch is being held exclusively
+@param[in] ignore_err error to be ignored when loading the table
+@return table
+@retval nullptr if does not exist */
+dict_table_t*
+dict_table_open_on_name(
+ const char* table_name,
+ bool dict_locked,
+ dict_err_ignore_t ignore_err)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Outcome of dict_foreign_find_index() or dict_foreign_qualify_index() */
+enum fkerr_t
+{
+ /** A backing index was found for a FOREIGN KEY constraint */
+ FK_SUCCESS = 0,
+ /** There is no index that covers the columns in the constraint. */
+ FK_INDEX_NOT_FOUND,
+ /** The index is for a prefix index, not a full column. */
+ FK_IS_PREFIX_INDEX,
+ /** A condition of SET NULL conflicts with a NOT NULL column. */
+ FK_COL_NOT_NULL,
+ /** The column types do not match */
+ FK_COLS_NOT_EQUAL
+};
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order and is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null,
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+ fkerr_t* error = NULL, /*!< out: error code */
+ ulint* err_col_no = NULL,
+ /*!< out: column number where
+ error happened */
+ dict_index_t** err_index = NULL)
+ /*!< out: index where error
+ happened */
+
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+
+/** Returns a virtual column's name.
+@param[in] table table object
+@param[in] col_nr virtual column number(nth virtual column)
+@return column name. */
+const char*
+dict_table_get_v_col_name(
+ const dict_table_t* table,
+ ulint col_nr);
+
+/** Check if the table has a given column.
+@param[in] table table object
+@param[in] col_name column name
+@param[in] col_nr column number guessed, 0 as default
+@return column number if the table has the specified column,
+otherwise table->n_def */
+ulint
+dict_table_has_column(
+ const dict_table_t* table,
+ const char* col_name,
+ ulint col_nr = 0);
+
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+std::string
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table); /*!< in: table */
+
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+std::string
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline); /*!< in: whether to add a newline */
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order and is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+bool
+dict_foreign_qualify_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* index, /*!< in: index to check */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null,
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+ fkerr_t* error, /*!< out: error code */
+ ulint* err_col_no,
+ /*!< out: column number where
+ error happened */
+ dict_index_t** err_index)
+ /*!< out: index where error
+ happened */
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+
+#define dict_index_is_clust(index) (index)->is_clust()
+#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust()
+#define dict_index_is_unique(index) (index)->is_unique()
+#define dict_index_is_spatial(index) (index)->is_spatial()
+#define dict_index_is_ibuf(index) (index)->is_ibuf()
+#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary()
+#define dict_index_has_virtual(index) (index)->has_virtual()
+
+/** Get all the FTS indexes on a table.
+@param[in] table table
+@param[out] indexes all FTS indexes on this table
+@return number of FTS indexes */
+ulint
+dict_table_get_all_fts_indexes(
+ const dict_table_t* table,
+ ib_vector_t* indexes);
+
+/********************************************************************//**
+Gets the number of user-defined non-virtual columns in a table in the
+dictionary cache.
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Gets the number of all non-virtual columns (also system) in a table
+in the dictionary cache.
+@return number of columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets the number of virtual columns in a table in the dictionary cache.
+@param[in] table the table to check
+@return number of virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+ const dict_table_t* table);
+
+/** Check if a table has indexed virtual columns
+@param[in] table the table to check
+@return true is the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+ const dict_table_t* table);
+
+/********************************************************************//**
+Gets the approximately estimated number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+ MY_ATTRIBUTE((nonnull));
+
+/** Get nth virtual column
+@param[in] table target table
+@param[in] col_nr column number in MySQL Table definition
+@return dict_v_col_t ptr */
+dict_v_col_t*
+dict_table_get_nth_v_col_mysql(
+ const dict_table_t* table,
+ ulint col_nr);
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Gets the nth virtual column of a table.
+@param[in] table table
+@param[in] pos position of virtual column
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+ const dict_table_t* table,
+ ulint pos);
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos) (&(table)->cols[pos])
+#define dict_table_get_sys_col(table, sys) \
+ &(table)->cols[(table)->n_cols + (sys) - DATA_N_SYS_COLS]
+/* Get nth virtual columns */
+#define dict_table_get_nth_v_col(table, pos) (&(table)->v_cols[pos])
+#endif /* UNIV_DEBUG */
+/** Wrapper function.
+@see dict_col_t::name()
+@param[in] table table
+@param[in] col_nr column number in table
+@return column name */
+inline
+const char*
+dict_table_get_col_name(const dict_table_t* table, ulint col_nr)
+{
+ return(dict_table_get_nth_col(table, col_nr)->name(*table));
+}
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define dict_table_is_comp(table) (table)->not_redundant()
+
+/** Determine if a table uses atomic BLOBs (no locally stored prefix).
+@param[in] table InnoDB table
+@return whether BLOBs are atomic */
+inline
+bool
+dict_table_has_atomic_blobs(const dict_table_t* table)
+{
+ return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags));
+}
+
+/** @return potential max length stored inline for externally stored fields */
+inline size_t dict_table_t::get_overflow_field_local_len() const
+{
+ if (dict_table_has_atomic_blobs(this)) {
+ /* ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED: do not
+ store any BLOB prefix locally */
+ return BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ /* up to MySQL 5.1: store a 768-byte prefix locally */
+ return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN;
+}
+
+/** Set the various values in a dict_table_t::flags pointer.
+@param[in,out] flags, Pointer to a 4 byte Table Flags
+@param[in] format, File Format
+@param[in] zip_ssize Zip Shift Size
+@param[in] use_data_dir Table uses DATA DIRECTORY
+@param[in] page_compressed Table uses page compression
+@param[in] page_compression_level Page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+ ulint* flags,
+ rec_format_t format,
+ ulint zip_ssize,
+ bool use_data_dir,
+ bool page_compressed,
+ ulint page_compression_level);
+
+/** Convert a 32 bit integer table flags to the 32 bit FSP Flags.
+Fsp Flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@param[in] table_flags dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags)
+ MY_ATTRIBUTE((const));
+
+/** Extract the ROW_FORMAT=COMPRESSED page size from table flags.
+@param[in] flags flags
+@return ROW_FORMAT=COMPRESSED page size
+@retval 0 if not compressed */
+inline ulint dict_tf_get_zip_size(ulint flags)
+{
+ flags &= DICT_TF_MASK_ZIP_SSIZE;
+ return flags
+ ? (UNIV_ZIP_SIZE_MIN >> 1)
+ << (FSP_FLAGS_GET_ZIP_SSIZE(flags >> DICT_TF_POS_ZIP_SSIZE
+ << FSP_FLAGS_POS_ZIP_SSIZE))
+ : 0;
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+ dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Copies types of virtual columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create().
+@param[in,out] tuple data tuple
+@param[in] table table
+*/
+void
+dict_table_copy_v_types(
+ dtuple_t* tuple,
+ const dict_table_t* table);
+
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull));
+/** Adds an index to the dictionary cache, with possible indexing newly
+added column.
+@param[in,out] index index; NOTE! The index memory
+ object is freed in this function!
+@param[in] page_no root page number of the index
+@param[in] add_v virtual columns being added along with ADD INDEX
+@return DB_SUCCESS, or DB_CORRUPTION */
+dberr_t
+dict_index_add_to_cache(
+ dict_index_t*& index,
+ ulint page_no,
+ const dict_add_v_col_t* add_v = NULL)
+ MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** The number of fields in the nonleaf page of spatial index, except
+the page no field. */
+#define DICT_INDEX_SPATIAL_NODEPTR_SIZE 1
+/**
+Gets the number of fields on nonleaf page level in the internal representation
+of an index which uniquely determine the position of an index entry in the
+index, if we also take multiversioning into account. Note, it doesn't
+include page no field.
+@param[in] index index
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+ const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Looks for column n in an index.
+@param[in] index index
+@param[in] n column number
+@param[in] inc_prefix true=consider column prefixes too
+@param[in] is_virtual true==virtual column
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_col_or_prefix_pos(
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ bool inc_prefix, /*!< in: TRUE=consider
+ column prefixes too */
+ bool is_virtual, /*!< in: is a virtual column
+ */
+ ulint* prefix_col_pos) /*!< out: col num if prefix
+ */
+ __attribute__((warn_unused_result));
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/** Add a column to an index.
+@param index index
+@param table table
+@param col column
+@param prefix_len column prefix length
+@param descending whether to use descending order */
+void dict_index_add_col(dict_index_t *index, const dict_table_t *table,
+ dict_col_t *col, ulint prefix_len,
+ bool descending= false)
+ MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Whether and when to allow temporary index names */
+enum check_name {
+ /** Require all indexes to be complete. */
+ CHECK_ALL_COMPLETE,
+ /** Allow aborted online index creation. */
+ CHECK_ABORTED_OK,
+ /** Allow partial indexes to exist. */
+ CHECK_PARTIAL_OK
+};
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ enum check_name check) /*!< in: whether and when to allow
+ temporary index names */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Convert a physical record into a search tuple.
+@param[in] rec index record (not necessarily in an index page)
+@param[in] index index
+@param[in] leaf whether rec is in a leaf page
+@param[in] n_fields number of data fields
+@param[in,out] heap memory heap for allocation
+@return own: data tuple */
+dtuple_t*
+dict_index_build_data_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ ulint n_fields,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* tree) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+
+/* Online index creation @{ */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+dict_index_t*
+dict_table_get_index_on_name(dict_table_t* table, const char* name)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+inline
+const dict_index_t*
+dict_table_get_index_on_name(const dict_table_t* table, const char* name)
+{
+ return dict_table_get_index_on_name(const_cast<dict_table_t*>(table),
+ name);
+}
+
+/***************************************************************
+Check whether a column exists in an FTS index. */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ /* out: ULINT_UNDEFINED if no match else
+ the offset within the vector */
+ ib_vector_t* indexes,/* in: vector containing only FTS indexes */
+ ulint col_no, /* in: col number to search for */
+ bool is_virtual)/*!< in: whether it is a virtual column */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Looks for an index with the given id given a table instance.
+@param[in] table table instance
+@param[in] id index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+ const dict_table_t* table,
+ index_id_t id)
+ MY_ATTRIBUTE((nonnull(1)));
+
+/** Maximum number of columns in a foreign key constraint. Please Note MySQL
+has a much lower limit on the number of columns allowed in a foreign key
+constraint */
+#define MAX_NUM_FK_COLUMNS 500
+
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE* dict_foreign_err_file;
+extern mysql_mutex_t dict_foreign_err_mutex;
+
+/** InnoDB data dictionary cache */
+class dict_sys_t
+{
+ /** The my_hrtime_coarse().val of the oldest lock_wait() start, or 0 */
+ std::atomic<ulonglong> latch_ex_wait_start;
+
+ /** the rw-latch protecting the data dictionary cache */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
+#ifdef UNIV_DEBUG
+ /** whether latch is being held in exclusive mode (by any thread) */
+ Atomic_relaxed<pthread_t> latch_ex;
+ /** number of S-latch holders */
+ Atomic_counter<uint32_t> latch_readers;
+#endif
+public:
+ /** Indexes of SYS_TABLE[] */
+ enum
+ {
+ SYS_TABLES= 0,
+ SYS_INDEXES,
+ SYS_COLUMNS,
+ SYS_FIELDS,
+ SYS_FOREIGN,
+ SYS_FOREIGN_COLS,
+ SYS_VIRTUAL
+ };
+ /** System table names */
+ static const span<const char> SYS_TABLE[];
+
+ /** all tables (persistent and temporary), hashed by name */
+ hash_table_t table_hash;
+ /** hash table of persistent table IDs */
+ hash_table_t table_id_hash;
+
+ /** the SYS_TABLES table */
+ dict_table_t *sys_tables;
+ /** the SYS_COLUMNS table */
+ dict_table_t *sys_columns;
+ /** the SYS_INDEXES table */
+ dict_table_t *sys_indexes;
+ /** the SYS_FIELDS table */
+ dict_table_t *sys_fields;
+ /** the SYS_FOREIGN table */
+ dict_table_t *sys_foreign;
+ /** the SYS_FOREIGN_COLS table */
+ dict_table_t *sys_foreign_cols;
+ /** the SYS_VIRTUAL table */
+ dict_table_t *sys_virtual;
+
+ /** @return whether all non-hard-coded system tables exist */
+ bool sys_tables_exist() const
+ { return UNIV_LIKELY(sys_foreign && sys_foreign_cols && sys_virtual); }
+
+ /** list of persistent tables that can be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_LRU;
+ /** list of persistent tables that cannot be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU;
+
+private:
+ bool m_initialised= false;
+ /** the sequence of temporary table IDs */
+ std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID};
+ /** hash table of temporary table IDs */
+ hash_table_t temp_id_hash;
+ /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID
+ (FIXME: remove this, and move to dict_table_t) */
+ Atomic_relaxed<row_id_t> row_id;
+ /** The synchronization interval of row_id */
+ static constexpr size_t ROW_ID_WRITE_MARGIN= 256;
+public:
+ /** Diagnostic message for exceeding the lock_wait() timeout */
+ static const char fatal_msg[];
+
+ /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+ inline row_id_t get_new_row_id();
+
+ /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+ inline void update_row_id(row_id_t id);
+
+ /** Recover the global DB_ROW_ID sequence on database startup */
+ void recover_row_id(row_id_t id)
+ {
+ row_id= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN) + ROW_ID_WRITE_MARGIN;
+ }
+
+ /** @return a new temporary table ID */
+ table_id_t acquire_temporary_table_id()
+ {
+ return temp_table_id.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ /** Look up a temporary table.
+ @param id temporary table ID
+ @return temporary table
+ @retval nullptr if the table does not exist
+ (should only happen during the rollback of CREATE...SELECT) */
+ dict_table_t *acquire_temporary_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold = ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ if (UNIV_LIKELY(table != nullptr))
+ {
+ DBUG_ASSERT(table->is_temporary());
+ DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
+ table->acquire();
+ }
+ return table;
+ }
+
+ /** Look up a persistent table.
+ @param id table ID
+ @return table
+ @retval nullptr if not cached */
+ dict_table_t *find_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold= ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ DBUG_ASSERT(!table || !table->is_temporary());
+ return table;
+ }
+
+ bool is_initialised() const { return m_initialised; }
+
+ /** Initialise the data dictionary cache. */
+ void create();
+
+ /** Close the data dictionary cache on shutdown. */
+ void close();
+
+ /** Resize the hash tables based on the current buffer pool size. */
+ void resize();
+
+ /** Add a table definition to the data dictionary cache */
+ inline void add(dict_table_t* table);
+ /** Remove a table definition from the data dictionary cache.
+ @param[in,out] table cached table definition to be evicted
+ @param[in] lru whether this is part of least-recently-used evictiono
+ @param[in] keep whether to keep (not free) the object */
+ void remove(dict_table_t* table, bool lru = false, bool keep = false);
+
+#ifdef UNIV_DEBUG
+ /** Find a table */
+ template <bool in_lru> bool find(const dict_table_t *table)
+ {
+ ut_ad(table);
+ ut_ad(table->can_be_evicted == in_lru);
+ ut_ad(frozen());
+ for (const dict_table_t* t= in_lru ? table_LRU.start : table_non_LRU.start;
+ t; t = UT_LIST_GET_NEXT(table_LRU, t))
+ {
+ if (t == table) return true;
+ ut_ad(t->can_be_evicted == in_lru);
+ }
+ return false;
+ }
+ /** Find a table */
+ bool find(const dict_table_t *table)
+ {
+ return table->can_be_evicted ? find<true>(table) : find<false>(table);
+ }
+#endif
+
+ /** Move a table to the non-LRU list from the LRU list. */
+ void prevent_eviction(dict_table_t *table)
+ {
+ ut_d(locked());
+ ut_ad(find(table));
+ if (!table->can_be_evicted)
+ return;
+ table->can_be_evicted= false;
+ UT_LIST_REMOVE(table_LRU, table);
+ UT_LIST_ADD_LAST(table_non_LRU, table);
+ }
+
+#ifdef UNIV_DEBUG
+ /** @return whether any thread (not necessarily the current thread)
+ is holding the latch; that is, this check may return false
+ positives */
+ bool frozen() const { return latch_readers || latch_ex; }
+ /** @return whether any thread (not necessarily the current thread)
+ is holding a shared latch */
+ bool frozen_not_locked() const { return latch_readers; }
+ /** @return whether the current thread holds the exclusive latch */
+ bool locked() const { return latch_ex == pthread_self(); }
+#endif
+private:
+ /** Acquire the exclusive latch */
+ ATTRIBUTE_NOINLINE
+ void lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line));
+public:
+ /** @return the my_hrtime_coarse().val of the oldest lock_wait() start,
+ assuming that requests are served on a FIFO basis */
+ ulonglong oldest_wait() const
+ { return latch_ex_wait_start.load(std::memory_order_relaxed); }
+
+ /** Exclusively lock the dictionary cache. */
+ void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+ if (latch.wr_lock_try())
+ {
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ }
+ else
+ lock_wait(SRW_LOCK_ARGS(file, line));
+ }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Unlock the data dictionary cache. */
+ ATTRIBUTE_NOINLINE void unlock();
+ /** Acquire a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void freeze(const char *file, unsigned line);
+ /** Release a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void unfreeze();
+#else
+ /** Unlock the data dictionary cache. */
+ void unlock()
+ {
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
+ }
+ /** Acquire a shared lock on the dictionary cache. */
+ void freeze()
+ {
+ latch.rd_lock();
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+ }
+ /** Release a shared lock on the dictionary cache. */
+ void unfreeze()
+ {
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+ }
+#endif
+
+ /** Estimate the used memory occupied by the data dictionary
+ table and index objects.
+ @return number of bytes occupied */
+ TPOOL_SUPPRESS_TSAN ulint rough_size() const
+ {
+ /* No latch; this is a very crude approximation anyway */
+ ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
+ size *= sizeof(dict_table_t)
+ + sizeof(dict_index_t) * 2
+ + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+ + sizeof(dict_field_t) * 5 /* total number of key fields */
+ + 200; /* arbitrary, covering names and overhead */
+ size += (table_hash.n_cells + table_id_hash.n_cells +
+ temp_id_hash.n_cells) * sizeof(hash_cell_t);
+ return size;
+ }
+
+ /** Evict unused, unlocked tables from table_LRU.
+ @param half whether to consider half the tables only (instead of all)
+ @return number of tables evicted */
+ ulint evict_table_LRU(bool half);
+
+ /** Look up a table in the dictionary cache.
+ @param name table name
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *find_table(const span<const char> &name) const
+ {
+ ut_ad(frozen());
+ for (dict_table_t *table= static_cast<dict_table_t*>
+ (HASH_GET_FIRST(&table_hash, table_hash.calc_hash
+ (my_crc32c(0, name.data(), name.size()))));
+ table; table= table->name_hash)
+ if (strlen(table->name.m_name) == name.size() &&
+ !memcmp(table->name.m_name, name.data(), name.size()))
+ return table;
+ return nullptr;
+ }
+
+ /** Look up or load a table definition
+ @param name table name
+ @param ignore errors to ignore when loading the table definition
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *load_table(const span<const char> &name,
+ dict_err_ignore_t ignore= DICT_ERR_IGNORE_NONE);
+
+ /** Attempt to load the system tables on startup
+ @return whether any discrepancy with the expected definition was found */
+ bool load_sys_tables();
+ /** Create or check system tables on startup */
+ dberr_t create_or_check_sys_tables();
+};
+
+/** the data dictionary cache */
+extern dict_sys_t dict_sys;
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two
+strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+void
+dict_fs2utf8(
+/*=========*/
+ const char* db_and_table, /*!< in: database and table names,
+ e.g. d@i1b/a@q1b@1Kc */
+ char* db_utf8, /*!< out: database name, e.g. dцb */
+ size_t db_utf8_size, /*!< in: dbname_utf8 size */
+ char* table_utf8, /*!< out: table name, e.g. aюbØc */
+ size_t table_utf8_size)/*!< in: table_utf8 size */
+ MY_ATTRIBUTE((nonnull));
+
+/** Flag an index corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES.
+@param index index to be flagged as corrupted
+@param ctx context (for error log reporting) */
+void dict_set_corrupted(dict_index_t *index, const char *ctx)
+ ATTRIBUTE_COLD __attribute__((nonnull));
+
+/** Sets merge_threshold in the SYS_INDEXES
+@param[in,out] index index
+@param[in] merge_threshold value to set */
+void
+dict_index_set_merge_threshold(
+ dict_index_t* index,
+ ulint merge_threshold);
+
+#ifdef UNIV_DEBUG
+/** Sets merge_threshold for all indexes in dictionary cache for debug.
+@param[in] merge_threshold_all value to set for all indexes */
+void
+dict_set_merge_threshold_all_debug(
+ uint merge_threshold_all);
+#endif /* UNIV_DEBUG */
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags);
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2);
+
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+void
+dict_index_zip_success(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+void
+dict_index_zip_failure(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Return the optimal page size, for which page will likely compress.
+@return page size beyond which page may not compress*/
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+ dict_index_t* index) /*!< in: index for which page size
+ is requested */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Convert table flag to row format string.
+@return row format name */
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+ ulint table_flag); /*!< in: row format setting */
+
+/** encode number of columns and number of virtual columns in one
+4 bytes value. We could do this because the number of columns in
+InnoDB is limited to 1017
+@param[in] n_col number of non-virtual column
+@param[in] n_v_col number of virtual column
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col);
+
+/** Decode number of virtual and non-virtual columns in one 4 bytes value.
+@param[in] encoded encoded value
+@param[in,out] n_col number of non-virtual column
+@param[in,out] n_v_col number of virtual column */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+ ulint encoded,
+ ulint* n_col,
+ ulint* n_v_col);
+
+/** Free the virtual column template
+@param[in,out] vc_templ virtual column template */
+UNIV_INLINE
+void
+dict_free_vc_templ(
+ dict_vcol_templ_t* vc_templ);
+
+/** Check whether the table have virtual index.
+@param[in] table InnoDB table
+@return true if the table have virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+ dict_table_t* table);
+
+#include "dict0dict.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
new file mode 100644
index 00000000..4cc3eae9
--- /dev/null
+++ b/storage/innobase/include/dict0dict.inl
@@ -0,0 +1,1217 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.ic
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0sysspace.h"
+#include "dict0pagecompress.h"
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbmaxlen;
+}
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+{
+ ut_ad(col != NULL);
+ ut_ad(type != NULL);
+
+ type->mtype = col->mtype;
+ type->prtype = col->prtype;
+ type->len = col->len;
+ type->mbminlen = col->mbminlen;
+ type->mbmaxlen = col->mbmaxlen;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(col->mtype == type->mtype);
+ ut_ad(col->prtype == type->prtype);
+ //ut_ad(col->len == type->len);
+ ut_ad(col->mbminlen == type->mbminlen);
+ ut_ad(col->mbmaxlen == type->mbmaxlen);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_max_size_low(col->mtype, col->len));
+}
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dict_col_get_fixed_size(col, comp));
+}
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(col->ind);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+{
+ ulint i;
+
+ ut_ad(dict_index_is_clust(clust_index));
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_field_t* field = &clust_index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Gets the column position in the given index.
+@param[in] col table column
+@param[in] index index to be searched for column
+@return position of column in the given index. */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+ const dict_col_t* col,
+ const dict_index_t* index)
+{
+ ulint i;
+
+ for (i = 0; i < index->n_def; i++) {
+ const dict_field_t* field = &index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes));
+}
+
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table))
+ ->indexes));
+}
+
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index));
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the number of user-defined non-virtual columns in a table in the
+dictionary cache.
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ /* n_cols counts stored columns only. A table may contain
+ virtual columns and no user-specified stored columns at all. */
+ ut_ad(table->n_cols >= DATA_N_SYS_COLS);
+ return unsigned(table->n_cols) - DATA_N_SYS_COLS;
+}
+
+/********************************************************************//**
+Gets the number of all non-virtual columns (also system) in a table
+in the dictionary cache.
+@return number of non-virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(table->n_cols);
+}
+
+/** Gets the number of virtual columns in a table in the dictionary cache.
+@param[in] table the table to check
+@return number of virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+ const dict_table_t* table)
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_v_cols);
+}
+
+/** Check if a table has indexed virtual columns
+@param[in] table the table to check
+@return true is the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+ const dict_table_t* table)
+{
+
+ for (unsigned i = 0; i < table->n_v_cols; i++) {
+ const dict_v_col_t* col = dict_table_get_nth_v_col(table, i);
+ if (col->m_col.ord_part) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/********************************************************************//**
+Gets the approximately estimated number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->stat_initialized);
+
+ return(table->stat_n_rows);
+}
+
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows < 0xFFFFFFFFFFFFFFFFULL) {
+ table->stat_n_rows = n_rows + 1;
+ }
+ }
+}
+
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows > 0) {
+ table->stat_n_rows = n_rows - 1;
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+{
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((dict_col_t*) (table->cols) + pos);
+}
+
+/** Gets the nth virtual column of a table.
+@param[in] table table
+@param[in] pos position of virtual column
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+ const dict_table_t* table,
+ ulint pos)
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_v_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->v_cols[pos].m_col.is_added());
+ ut_ad(!table->v_cols[pos].m_col.is_dropped());
+ return &table->v_cols[pos];
+}
+
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+ col = dict_table_get_nth_col(table,
+ dict_table_get_sys_col_no(table, sys));
+ ut_ad(col->mtype == DATA_SYS);
+ ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+ return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS);
+}
+
+/************************************************************************
+Check if the table has an FTS index. */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+ /* out: TRUE if table has an FTS index */
+ dict_table_t* table) /* in: table */
+{
+ return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS));
+}
+
+/** Validate the flags for tables that are not ROW_FORMAT=REDUNDANT.
+@param[in] flags table flags
+@return whether the flags are valid */
+inline
+bool
+dict_tf_is_valid_not_redundant(ulint flags)
+{
+ const bool atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
+
+ ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+
+ if (!zip_ssize) {
+ /* Not ROW_FORMAT=COMPRESSED */
+ } else if (!atomic_blobs) {
+ /* ROW_FORMAT=COMPRESSED implies ROW_FORMAT=DYNAMIC
+ for the uncompressed page format */
+ return(false);
+ } else if (zip_ssize > PAGE_ZIP_SSIZE_MAX
+ || zip_ssize > srv_page_size_shift
+ || srv_page_size_shift > UNIV_ZIP_SIZE_SHIFT_MAX) {
+ /* KEY_BLOCK_SIZE is out of bounds, or
+ ROW_FORMAT=COMPRESSED is not supported with this
+ innodb_page_size (only up to 16KiB) */
+ return(false);
+ }
+
+ switch (DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags)) {
+ case 0:
+ /* PAGE_COMPRESSION_LEVEL=0 should imply PAGE_COMPRESSED=NO */
+ return(!DICT_TF_GET_PAGE_COMPRESSION(flags));
+ case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9:
+ /* PAGE_COMPRESSION_LEVEL requires
+ ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC
+ (not ROW_FORMAT=COMPRESSED or ROW_FORMAT=REDUNDANT)
+ and PAGE_COMPRESSED=YES */
+ return(!zip_ssize && DICT_TF_GET_PAGE_COMPRESSION(flags));
+ default:
+ /* Invalid PAGE_COMPRESSION_LEVEL value */
+ return(false);
+ }
+}
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags)
+{
+ ut_ad(flags < 1U << DICT_TF_BITS);
+ /* The DATA_DIRECTORY flag can be assigned fully independently
+ of all other persistent table flags. */
+ flags &= ~DICT_TF_MASK_DATA_DIR;
+ if (!(flags & 1)) {
+ /* Only ROW_FORMAT=REDUNDANT has 0 in the least significant
+ bit. For ROW_FORMAT=REDUNDANT, only the DATA_DIR flag
+ (which we cleared above) can be set. If any other flags
+ are set, the flags are invalid. */
+ return(flags == 0 || flags == DICT_TF_MASK_NO_ROLLBACK);
+ }
+
+ return(dict_tf_is_valid_not_redundant(flags));
+}
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2)
+{
+ if (!dict_tf_is_valid(flags)) {
+ return(false);
+ }
+
+ if ((flags2 & DICT_TF2_UNUSED_BIT_MASK) != 0) {
+ return(false);
+ }
+
+ return(true);
+}
+
+/********************************************************************//**
+Determine the file format from dict_table_t::flags
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set.
+@return file format version */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ut_a(dict_tf_is_valid(flags));
+
+ if (!DICT_TF_GET_COMPACT(flags)) {
+ return(REC_FORMAT_REDUNDANT);
+ }
+
+ if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+ return(REC_FORMAT_COMPACT);
+ }
+
+ if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+ return(REC_FORMAT_COMPRESSED);
+ }
+
+ return(REC_FORMAT_DYNAMIC);
+}
+
+/** Set the various values in a dict_table_t::flags pointer.
+@param[in,out] flags, Pointer to a 4 byte Table Flags
+@param[in] format File Format
+@param[in] zip_ssize Zip Shift Size
+@param[in] use_data_dir Table uses DATA DIRECTORY
+@param[in] page_compressed Table uses page compression
+@param[in] page_compression_level Page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+/*========*/
+ ulint* flags,
+ rec_format_t format,
+ ulint zip_ssize,
+ bool use_data_dir,
+ bool page_compressed,
+ ulint page_compression_level)
+{
+ *flags = use_data_dir ? 1 << DICT_TF_POS_DATA_DIR : 0;
+
+ switch (format) {
+ case REC_FORMAT_REDUNDANT:
+ ut_ad(zip_ssize == 0);
+ /* no other options are allowed */
+ ut_ad(!page_compressed);
+ return;
+ case REC_FORMAT_COMPACT:
+ *flags |= DICT_TF_COMPACT;
+ ut_ad(zip_ssize == 0);
+ break;
+ case REC_FORMAT_COMPRESSED:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (zip_ssize << DICT_TF_POS_ZIP_SSIZE);
+ break;
+ case REC_FORMAT_DYNAMIC:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS);
+ ut_ad(zip_ssize == 0);
+ break;
+ }
+
+ if (page_compressed) {
+ *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (1 << DICT_TF_POS_PAGE_COMPRESSION)
+ | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+
+ ut_ad(zip_ssize == 0);
+ ut_ad(dict_tf_get_page_compression(*flags) == TRUE);
+ ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level);
+ }
+}
+
+/** Convert a 32 bit integer table flags to the 32 bit FSP Flags.
+Fsp Flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@param[in] table_flags dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags)
+{
+ uint32_t fsp_flags;
+ uint32_t page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(
+ table_flags);
+
+ ut_ad((DICT_TF_GET_PAGE_COMPRESSION(table_flags) == 0)
+ == (page_compression_level == 0));
+
+ DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return UINT32_MAX;);
+
+ /* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32 */
+ if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+ || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32)
+ && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) {
+
+ fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= static_cast<uint32_t>(
+ innodb_compression_algorithm)
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ }
+ } else {
+ /* Adjust bit zero. */
+ fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;
+
+ /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
+ fsp_flags |= table_flags
+ & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS);
+
+ fsp_flags |= FSP_FLAGS_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION;
+ }
+ }
+
+ ut_a(fil_space_t::is_valid_flags(fsp_flags, false));
+
+ if (DICT_TF_HAS_DATA_DIR(table_flags)) {
+ fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR;
+ }
+
+ fsp_flags |= page_compression_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+
+ return(fsp_flags);
+}
+
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32bit integer that is written
+to a SYS_TABLES.TYPE field. The following chart shows the translation of
+the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags | 0 | 1 | 1
+SYS_TABLES.TYPE | 1 | 1 | 1
+==================================================================
+@return ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ulint type;
+
+ ut_a(dict_tf_is_valid(flags));
+
+ /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
+ type = 1;
+
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL are the same. */
+ type |= flags & (DICT_TF_MASK_ZIP_SSIZE
+ | DICT_TF_MASK_ATOMIC_BLOBS
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_NO_ROLLBACK);
+
+ return(type);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ return(index->n_uniq);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
+
+/**
+Gets the number of fields on nonleaf page level in the internal representation
+of an index which uniquely determine the position of an index entry in the
+index, if we also take multiversioning into account. Note, it doesn't
+include page no field.
+@param[in] index index
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+ const dict_index_t* index)
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_spatial(index)) {
+ /* For spatial index, on non-leaf page, we have only
+ 2 fields(mbr+page_no). So, except page no field,
+ there's one field there. */
+ return(DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+ } else {
+ return(dict_index_get_n_unique_in_tree(index));
+ }
+}
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+{
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+{
+ return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+{
+ return(dict_index_get_nth_col_or_prefix_pos(index, n, false, false,
+ prefix_col_pos));
+}
+
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ unsigned n= dict_index_get_n_fields(index);
+ unsigned size= 0;
+
+ while (n--)
+ size+= dict_col_get_min_size(dict_index_get_nth_col(index, n));
+
+ return size;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->page);
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+ return(srv_page_size / 16);
+}
+
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+{
+ enum online_index_status status;
+
+ status = (enum online_index_status) index->online_status;
+
+ /* Without the index->lock protection, the online
+ status can change from ONLINE_INDEX_CREATION to
+ ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in
+ row_log_apply() once log application is done. So to make
+ sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE
+ you should always do the recheck after acquiring index->lock */
+
+#ifdef UNIV_DEBUG
+ switch (status) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ return(status);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(status);
+}
+
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(index->lock.have_x());
+
+#ifdef UNIV_DEBUG
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ break;
+ case ONLINE_INDEX_ABORTED:
+ ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED);
+ break;
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ index->online_status = status & 3;
+ ut_ad(dict_index_get_online_status(index) == status);
+}
+
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+#ifdef UNIV_DEBUG
+ if (dict_index_is_clust(index)) {
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ return(true);
+ case ONLINE_INDEX_COMPLETE:
+ return(false);
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ break;
+ }
+ ut_ad(0);
+ return(false);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(UNIV_UNLIKELY(dict_index_get_online_status(index)
+ != ONLINE_INDEX_COMPLETE));
+}
+
+/**********************************************************************//**
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match else the offset within the vector */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */
+ ulint col_no, /*!< in: col number to search for */
+ bool is_virtual) /*!< in: whether it is a virtual column */
+
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(indexes); ++i) {
+ dict_index_t* index;
+
+ index = (dict_index_t*) ib_vector_getp(indexes, i);
+
+ if (index->contains_col_or_prefix(col_no, is_virtual)) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if !dict_table_has_atomic_blobs(table), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+{
+ if (!dict_table_has_atomic_blobs(table)) {
+ return(0);
+ }
+
+ if (col->max_prefix != 0) {
+ return(col->max_prefix);
+ }
+
+ return(REC_VERSION_56_MAX_INDEX_COL_LEN);
+}
+
+/** Determine maximum bytes of a virtual column need to be stored
+in the undo log.
+@param[in] table dict_table_t for the table
+@param[in] col_no virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+ dict_table_t* table,
+ ulint col_no)
+{
+ const dict_col_t* col
+ = &dict_table_get_nth_v_col(table, col_no)->m_col;
+ ulint max_log_len;
+
+ /* This calculation conforms to the non-virtual column
+ maximum log length calculation:
+ 1) if No atomic BLOB, upto REC_ANTELOPE_MAX_INDEX_COL_LEN
+ 2) if atomic BLOB, upto col->max_prefix or
+ REC_VERSION_56_MAX_INDEX_COL_LEN, whichever is less */
+ if (dict_table_has_atomic_blobs(table)) {
+ if (DATA_BIG_COL(col) && col->max_prefix > 0) {
+ max_log_len = col->max_prefix;
+ } else {
+ max_log_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+ }
+ } else {
+ max_log_len = REC_ANTELOPE_MAX_INDEX_COL_LEN;
+ }
+
+ return(max_log_len);
+}
+
+/** Check if the table is found is a file_per_table tablespace.
+This test does not use table flags2 since some REDUNDANT tables in the
+system tablespace may have garbage in the MIX_LEN field where flags2 is
+stored. These garbage MIX_LEN fields were written before v3.23.52.
+A patch was added to v3.23.52 which initializes the MIX_LEN field to 0.
+Since file-per-table tablespaces were added in 4.1, any SYS_TABLES
+record with a non-zero space ID will have a reliable MIX_LEN field.
+However, this test does not use flags2 from SYS_TABLES.MIX_LEN. Instead,
+assume that if the tablespace is not a predefined system tablespace,
+ then it must be file-per-table.
+Also, during ALTER TABLE, the DICT_TF2_USE_FILE_PER_TABLE flag may not be
+set on one of the file-per-table tablespaces.
+This test cannot be done on a table in the process of being created
+because the space_id will be zero until the tablespace is created.
+@param[in] table An existing open table to check
+@return true if this table was created as a file-per-table tablespace. */
+UNIV_INLINE
+bool
+dict_table_is_file_per_table(
+ const dict_table_t* table) /*!< in: table to check */
+{
+ return table->space != fil_system.sys_space
+ && table->space != fil_system.temp_space;
+}
+
+/** Acquire the table handle. */
+inline void dict_table_t::acquire()
+{
+ ut_ad(dict_sys.frozen());
+ n_ref_count++;
+}
+
+/** Release the table handle.
+@return whether the last handle was released */
+inline
+bool
+dict_table_t::release()
+{
+ auto n = n_ref_count--;
+ ut_ad(n > 0);
+ return n == 1;
+}
+
+/** Encode the number of columns and number of virtual columns in a
+4 bytes value. We could do this because the number of columns in
+InnoDB is limited to 1017
+@param[in] n_col number of non-virtual column
+@param[in] n_v_col number of virtual column
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col)
+{
+ return(n_col + (n_v_col<<16));
+}
+
+/** decode number of virtual and non-virtual columns in one 4 bytes value.
+@param[in] encoded encoded value
+@param[in,out] n_col number of non-virtual column
+@param[in,out] n_v_col number of virtual column */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+ ulint encoded,
+ ulint* n_col,
+ ulint* n_v_col)
+{
+
+ ulint num = encoded & ~DICT_N_COLS_COMPACT;
+ *n_v_col = num >> 16;
+ *n_col = num & 0xFFFF;
+}
+
+/** Free the virtual column template
+@param[in,out] vc_templ virtual column template */
+void
+dict_free_vc_templ(
+ dict_vcol_templ_t* vc_templ)
+{
+ UT_DELETE_ARRAY(vc_templ->default_rec);
+ vc_templ->default_rec = NULL;
+
+ if (vc_templ->vtempl != NULL) {
+ ut_ad(vc_templ->n_v_col > 0);
+ for (ulint i = 0; i < vc_templ->n_col
+ + vc_templ->n_v_col; i++) {
+ if (vc_templ->vtempl[i] != NULL) {
+ ut_free(vc_templ->vtempl[i]);
+ }
+ }
+ ut_free(vc_templ->vtempl);
+ vc_templ->vtempl = NULL;
+ }
+}
+
+/** Check whether the table have virtual index.
+@param[in] table InnoDB table
+@return true if the table have virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+ dict_table_t* table)
+{
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
new file mode 100644
index 00000000..f7d33d5b
--- /dev/null
+++ b/storage/innobase/include/dict0load.h
@@ -0,0 +1,220 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "btr0types.h"
+
+#include <deque>
+
+/** A stack of table names related through foreign key constraints */
+typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t;
+
+/** Check each tablespace found in the data dictionary.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
+
+In a crash recovery we already have some tablespace objects created from
+processing the REDO log. We will compare the
+space_id information in the data dictionary to what we find in the
+tablespace file. In addition, more validation will be done if recovery
+was needed and force_recovery is not set.
+
+We also scan the biggest space id, and store it to fil_system. */
+void dict_check_tablespaces_and_store_max_id();
+
+/** Make sure the data_file_name is saved in dict_table_t if needed.
+@param[in,out] table Table object */
+void dict_get_and_save_data_dir_path(dict_table_t* table);
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ dict_err_ignore_t ignore_err); /*!< in: errors to ignore
+ when loading the table */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table); /*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary.
+
+The foreign key constraint is loaded only if the referenced table is also
+in the dictionary cache. If the referenced table is not in dictionary
+cache, then it is added to the output parameter (fk_tables).
+
+@return DB_SUCCESS or error code */
+dberr_t
+dict_load_foreigns(
+/*===============*/
+ const char* table_name, /*!< in: table name */
+ const char** col_names, /*!< in: column names, or NULL
+ to use table->col_names */
+ trx_id_t trx_id, /*!< in: DDL transaction id,
+ or 0 to check
+ recursive load of tables
+ chained by FK */
+ bool check_charsets, /*!< in: whether to check
+ charset compatibility */
+ dict_err_ignore_t ignore_err, /*!< in: error to be ignored */
+ dict_names_t& fk_tables) /*!< out: stack of table names
+ which must be loaded
+ subsequently to load all the
+ foreign key constraints. */
+ MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+This function opens a system table, and return the first record.
+@return first record of the system table */
+const rec_t*
+dict_startscan_system(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor to
+ the record */
+ mtr_t* mtr, /*!< in: the mini-transaction */
+ dict_table_t* table); /*!< in: system table */
+/********************************************************************//**
+This function get the next system table record as we scan the table.
+@return the record if found, NULL if end of scan. */
+const rec_t*
+dict_getnext_system(
+/*================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor
+ to the record */
+ mtr_t* mtr); /*!< in: the mini-transaction */
+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.
+@param[in,out] mtr mini-transaction
+@param[in] uncommitted whether to use READ UNCOMMITTED isolation level
+@param[in] rec SYS_TABLES record
+@param[out,own] table table, or nullptr
+@return error message
+@retval nullptr on success */
+const char *dict_load_table_low(mtr_t *mtr, bool uncommitted,
+ const rec_t *rec, dict_table_t **table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populate a dict_index_t
+structure with the information from the record. For detail information
+about SYS_INDEXES fields, please refer to dict_boot() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_INDEXES rec */
+ dict_index_t* index, /*!< out: dict_index_t to be
+ filled */
+ table_id_t* table_id); /*!< out: table id */
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populate a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_COLUMNS rec */
+ dict_col_t* column, /*!< out: dict_col_t to be filled */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name, /*!< out: column name */
+ ulint* nth_v_col); /*!< out: if virtual col, this is
+ records its sequence number */
+
+/** This function parses a SYS_VIRTUAL record and extract virtual column
+information
+@param[in,out] heap heap memory
+@param[in] rec current SYS_COLUMNS rec
+@param[in,out] table_id table id
+@param[in,out] pos virtual column position
+@param[in,out] base_pos base column position
+@return error message, or NULL on success */
+const char*
+dict_process_sys_virtual_rec(
+ const rec_t* rec,
+ table_id_t* table_id,
+ ulint* pos,
+ ulint* base_pos);
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populate a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FIELDS rec */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ index_id_t* index_id, /*!< out: current index id */
+ index_id_t last_id); /*!< in: previous index id */
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populate a dict_foreign_t
+structure with the information from the record. For detail information
+about SYS_FOREIGN fields, please refer to dict_load_foreign() function
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN rec */
+ dict_foreign_t* foreign); /*!< out: dict_foreign_t to be
+ filled */
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record and extract necessary
+information from the record and return to caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */
+ const char** name, /*!< out: foreign key constraint name */
+ const char** for_col_name, /*!< out: referencing column name */
+ const char** ref_col_name, /*!< out: referenced column name
+ in referenced table */
+ ulint* pos); /*!< out: column position */
+
+#endif
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
new file mode 100644
index 00000000..fde2a714
--- /dev/null
+++ b/storage/innobase/include/dict0mem.h
@@ -0,0 +1,2649 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "sux_lock.h"
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "fts0fts.h"
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+#include "gis0type.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "mysql_com.h"
+#include <sql_const.h>
+#include <set>
+#include <algorithm>
+#include <iterator>
+#include <ostream>
+#include <mutex>
+
+/* Forward declaration. */
+struct ib_rbt_t;
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED 1 /*!< clustered index; for other than
+ auto-generated clustered indexes,
+ also DICT_UNIQUE will be set */
+#define DICT_UNIQUE 2 /*!< unique index */
+#define DICT_IBUF 8 /*!< insert buffer tree */
+#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag
+ in SYS_INDEXES.TYPE */
+#define DICT_FTS 32 /* FTS index; can't be combined with the
+ other flags */
+#define DICT_SPATIAL 64 /* SPATIAL index; can't be combined with the
+ other flags */
+#define DICT_VIRTUAL 128 /* Index on Virtual column */
+
+#define DICT_IT_BITS 8 /*!< number of bits used for
+ SYS_INDEXES.TYPE */
+/* @} */
+
+#if 0 /* not implemented, retained for history */
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */
+#define DICT_TABLE_CLUSTER_MEMBER 2
+#define DICT_TABLE_CLUSTER 3 /* this means that the table is
+ really a cluster definition */
+#endif
+
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
+
+==================== Low order flags bit =========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE | 1 | 1 | 1
+dict_table_t::flags | 0 | 1 | 1
+FSP_SPACE_FLAGS | 0 | 0 | 1
+fil_space_t::flags | 0 | 0 | 1
+
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
+
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */
+/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT 1U /*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */
+constexpr uint32_t DICT_N_COLS_COMPACT= 1U << 31;
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT 1
+
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE 4
+
+/** Width of the ATOMIC_BLOBS flag. The ROW_FORMAT=REDUNDANT and
+ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes
+in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
+store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS 1
+
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR 1
+
+/**
+Width of the page compression flag
+*/
+#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
+#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/**
+The NO_ROLLBACK flag (3=yes; the values 1,2 used stand for
+ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3)
+*/
+#define DICT_TF_WIDTH_NO_ROLLBACK 2
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
+ + DICT_TF_WIDTH_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_DATA_DIR \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT 0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE (DICT_TF_POS_COMPACT \
+ + DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \
+ + DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the NO_ROLLBACK field */
+#define DICT_TF_POS_NO_ROLLBACK (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
+#define DICT_TF_POS_UNUSED (DICT_TF_POS_NO_ROLLBACK \
+ + DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT \
+ ((~(~0U << DICT_TF_WIDTH_COMPACT)) \
+ << DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE \
+ ((~(~0U << DICT_TF_WIDTH_ZIP_SSIZE)) \
+ << DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS \
+ ((~(~0U << DICT_TF_WIDTH_ATOMIC_BLOBS)) \
+ << DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR \
+ ((~(~0U << DICT_TF_WIDTH_DATA_DIR)) \
+ << DICT_TF_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define DICT_TF_MASK_PAGE_COMPRESSION \
+ ((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
+ << DICT_TF_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \
+ ((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+ << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the NO_ROLLBACK field */
+#define DICT_TF_MASK_NO_ROLLBACK \
+ ((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \
+ << DICT_TF_POS_NO_ROLLBACK)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags) \
+ ((flags & DICT_TF_MASK_COMPACT) \
+ >> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags) \
+ ((flags & DICT_TF_MASK_ZIP_SSIZE) \
+ >> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+ ((flags & DICT_TF_MASK_ATOMIC_BLOBS) \
+ >> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags) \
+ ((flags & DICT_TF_MASK_DATA_DIR) \
+ >> DICT_TF_POS_DATA_DIR)
+/** Return the value of the PAGE_COMPRESSION field */
+#define DICT_TF_GET_PAGE_COMPRESSION(flags) \
+ ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \
+ >> DICT_TF_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \
+ ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \
+ >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+
+/* @} */
+
+/** @brief Table Flags set number 2.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags
+for unknown bits in order to protect backward incompatibility. */
+/* @{ */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS 7
+#define DICT_TF2_UNUSED_BIT_MASK (~0U << DICT_TF2_BITS)
+#define DICT_TF2_BIT_MASK ~DICT_TF2_UNUSED_BIT_MASK
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY 1U
+
+/** The table has an internal defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID 2U
+
+/** The table has an FTS index */
+#define DICT_TF2_FTS 4U
+
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID 8U
+
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_FILE_PER_TABLE 16U
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED 32U
+
+/** This bit is set if all aux table names (both common tables and
+index tables) of a FTS table are in HEX format. */
+#define DICT_TF2_FTS_AUX_HEX_NAME 64U
+
+/* @} */
+
+#define DICT_TF2_FLAG_SET(table, flag) \
+ (table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag) \
+ (table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag) \
+ (table->flags2 &= ~(flag) & ((1U << DICT_TF2_BITS) - 1))
+
+/** Tables could be chained together with Foreign key constraint. When
+first load the parent table, we would load all of its descedents.
+This could result in rescursive calls and out of stack error eventually.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads,
+when exceeded, the child table will not be loaded. It will be loaded when
+the foreign constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD 20
+
+/** Similarly, when tables are chained together with foreign key constraints
+with on cascading delete/update clause, delete from parent table could
+result in recursive cascading calls. This defines the maximum number of
+such cascading deletes/updates allowed. When exceeded, the delete from
+parent table will fail, and user has to drop excessive foreign constraint
+before proceeds. */
+#define FK_MAX_CASCADE_DEL 15
+
+/****************************************************************/ /**
+ Free a table memory object. */
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision */
+ MY_ATTRIBUTE((nonnull(1)));
+/** Adds a virtual column definition to a table.
+@param[in,out] table table
+@param[in] heap temporary memory heap, or NULL. It is
+ used to store name when we have not finished
+ adding all columns. When all columns are
+ added, the whole name will copy to memory from
+ table->heap
+@param[in] name column name
+@param[in] mtype main datatype
+@param[in] prtype precise type
+@param[in] len length
+@param[in] pos position in a table
+@param[in] num_base number of base columns
+@return the virtual column definition */
+dict_v_col_t*
+dict_mem_table_add_v_col(
+ dict_table_t* table,
+ mem_heap_t* heap,
+ const char* name,
+ ulint mtype,
+ ulint prtype,
+ ulint len,
+ ulint pos,
+ ulint num_base);
+
+/** Adds a stored column definition to a table.
+@param[in] table table
+@param[in] num_base number of base columns. */
+void
+dict_mem_table_add_s_col(
+ dict_table_t* table,
+ ulint num_base);
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+void
+dict_mem_table_col_rename(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ulint nth_col,/*!< in: column index */
+ const char* from, /*!< in: old column name */
+ const char* to, /*!< in: new column name */
+ bool is_virtual);
+ /*!< in: if this is a virtual column */
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+void
+dict_mem_fill_column_struct(
+/*========================*/
+ dict_col_t* column, /*!< out: column struct to be
+ filled */
+ ulint col_pos, /*!< in: column position */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint col_len); /*!< in: column length */
+/**********************************************************************//**
+This function poplulates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+ dict_index_t* index, /*!< out: index to be filled */
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* index_name, /*!< in: index name */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ dict_table_t* table, /*!< in: table */
+ const char* index_name, /*!< in: index name */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+
+/**********************************************************************//**
+Frees an index memory object. */
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc); /*!< in: is an alloc needed */
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc); /*!< in: is an alloc needed */
+
+/** Fills the dependent virtual columns in a set.
+Reason for being dependent are
+1) FK can be present on base column of virtual columns
+2) FK can be present on column which is a part of virtual index
+@param[in,out] foreign foreign key information. */
+void
+dict_mem_foreign_fill_vcol_set(
+ dict_foreign_t* foreign);
+
+/** Fill virtual columns set in each fk constraint present in the table.
+@param[in,out] table innodb table object. */
+void
+dict_mem_table_fill_foreign_vcol_set(
+ dict_table_t* table);
+
+/** Free the vcol_set from all foreign key constraint on the table.
+@param[in,out] table innodb table object. */
+void
+dict_mem_table_free_foreign_vcol_set(
+ dict_table_t* table);
+
+/** Create a temporary tablename like "#sql-ibNNN".
+@param[in] heap A memory heap
+@param[in] dbtab Table name in the form database/table name
+@param[in] id Table id
+@return A unique temporary tablename suitable for InnoDB use */
+char*
+dict_mem_create_temporary_tablename(
+ mem_heap_t* heap,
+ const char* dbtab,
+ table_id_t id);
+
+/** SQL identifier name wrapper for pretty-printing */
+class id_name_t
+{
+public:
+ /** Default constructor */
+ id_name_t()
+ : m_name()
+ {}
+ /** Constructor
+ @param[in] name identifier to assign */
+ explicit id_name_t(
+ const char* name)
+ : m_name(name)
+ {}
+
+ /** Assignment operator
+ @param[in] name identifier to assign */
+ id_name_t& operator=(
+ const char* name)
+ {
+ m_name = name;
+ return(*this);
+ }
+
+ /** Implicit type conversion
+ @return the name */
+ operator const char*() const
+ {
+ return(m_name);
+ }
+
+ /** Explicit type conversion
+ @return the name */
+ const char* operator()() const
+ {
+ return(m_name);
+ }
+
+private:
+ /** The name in internal representation */
+ const char* m_name;
+};
+
+/** Data structure for a column in a table */
+struct dict_col_t{
+ /*----------------------*/
+ /** The following are copied from dtype_t,
+ so that all bit-fields can be packed tightly. */
+ /* @{ */
+ unsigned prtype:32; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+ unsigned mtype:8; /*!< main data type */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+
+ unsigned mbminlen:3; /*!< minimum length of a
+ character, in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a
+ character, in bytes */
+ /*----------------------*/
+ /* End of definitions copied from dtype_t */
+ /* @} */
+
+ unsigned ind:10; /*!< table column position
+ (starting from 0) */
+ unsigned ord_part:1; /*!< nonzero if this column
+ appears in the ordering fields
+ of an index */
+ unsigned max_prefix:12; /*!< maximum index prefix length on
+ this column. Our current max limit is
+ 3072 (REC_VERSION_56_MAX_INDEX_COL_LEN)
+ bytes. */
+private:
+ /** Special value of ind for a dropped column */
+ static const unsigned DROPPED = 1023;
+public:
+
+ /** Detach a virtual column from an index.
+ @param index being-freed index */
+ inline void detach(const dict_index_t &index);
+
+ /** Data for instantly added columns */
+ struct def_t
+ {
+ /** original default value of instantly added column */
+ const void *data;
+ /** len of data, or UNIV_SQL_DEFAULT if unavailable */
+ ulint len;
+ } def_val;
+
+ /** Retrieve the column name.
+ @param table the table of this column */
+ const char *name(const dict_table_t &table) const;
+
+ /** @return whether this is a virtual column */
+ bool is_virtual() const { return prtype & DATA_VIRTUAL; }
+ /** @return whether NULL is an allowed value for this column */
+ bool is_nullable() const { return !(prtype & DATA_NOT_NULL); }
+
+ /** @return whether table of this system field is TRX_ID-based */
+ bool vers_native() const
+ {
+ ut_ad(vers_sys_start() || vers_sys_end());
+ ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY);
+ return mtype == DATA_INT;
+ }
+ /** @return whether this user column (not row_start, row_end)
+ has System Versioning property */
+ bool is_versioned() const { return !(~prtype & DATA_VERSIONED); }
+ /** @return whether this is the system version start */
+ bool vers_sys_start() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_START;
+ }
+ /** @return whether this is the system version end */
+ bool vers_sys_end() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_END;
+ }
+
+ /** @return whether this is an instantly-added column */
+ bool is_added() const
+ {
+ DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data);
+ return def_val.len != UNIV_SQL_DEFAULT;
+ }
+ /** Flag the column instantly dropped */
+ void set_dropped() { ind = DROPPED; }
+ /** Flag the column instantly dropped.
+ @param not_null whether the column was NOT NULL
+ @param len2 whether the length exceeds 255 bytes
+ @param fixed_len the fixed length in bytes, or 0 */
+ void set_dropped(bool not_null, bool len2, unsigned fixed)
+ {
+ DBUG_ASSERT(!len2 || !fixed);
+ prtype= not_null ? DATA_NOT_NULL | DATA_BINARY_TYPE : DATA_BINARY_TYPE;
+ if (fixed)
+ {
+ mtype= DATA_FIXBINARY;
+ len= static_cast<uint16_t>(fixed);
+ }
+ else
+ {
+ mtype= DATA_BINARY;
+ len= len2 ? 65535 : 255;
+ }
+ mbminlen= mbmaxlen= 0;
+ ind= DROPPED;
+ ord_part= 0;
+ max_prefix= 0;
+ }
+ /** @return whether the column was instantly dropped */
+ bool is_dropped() const { return ind == DROPPED; }
+ /** @return whether the column was instantly dropped
+ @param index the clustered index */
+ inline bool is_dropped(const dict_index_t &index) const;
+
+ /** Get the default value of an instantly-added column.
+ @param[out] len value length (in bytes), or UNIV_SQL_NULL
+ @return default value
+ @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+ const byte *instant_value(ulint *len) const
+ {
+ DBUG_ASSERT(is_added());
+ *len= def_val.len;
+ return static_cast<const byte*>(def_val.data);
+ }
+
+ /** Remove the 'instant ADD' status of the column */
+ void clear_instant()
+ {
+ def_val.len= UNIV_SQL_DEFAULT;
+ def_val.data= NULL;
+ }
+
+ /** @return whether two columns have compatible data type encoding */
+ bool same_type(const dict_col_t &other) const
+ {
+ if (mtype != other.mtype)
+ {
+ /* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR
+ will be used instead of DATA_MYSQL and DATA_VARMYSQL.
+ As long as mtype,prtype are being written to InnoDB
+ data dictionary tables, we cannot simplify this. */
+ switch (mtype) {
+ default:
+ return false;
+ case DATA_VARCHAR:
+ if (other.mtype != DATA_VARMYSQL)
+ return false;
+ goto check_encoding;
+ case DATA_VARMYSQL:
+ if (other.mtype != DATA_VARCHAR)
+ return false;
+ goto check_encoding;
+ case DATA_CHAR:
+ if (other.mtype != DATA_MYSQL)
+ return false;
+ goto check_encoding;
+ case DATA_MYSQL:
+ if (other.mtype != DATA_CHAR)
+ return false;
+ goto check_encoding;
+ }
+ }
+ else if (dtype_is_string_type(mtype))
+ {
+ check_encoding:
+ const uint16_t cset= dtype_get_charset_coll(prtype);
+ const uint16_t ocset= dtype_get_charset_coll(other.prtype);
+ return cset == ocset || dict_col_t::same_encoding(cset, ocset);
+ }
+
+ return true;
+ }
+
+ /** @return whether two collations codes have the same character encoding */
+ static bool same_encoding(uint16_t a, uint16_t b);
+
+ /** Determine if the columns have the same format
+ except for is_nullable() and is_versioned().
+ @param other column to compare to
+ @return whether the columns have the same format */
+ bool same_format(const dict_col_t &other) const
+ {
+ return same_type(other) && len >= other.len &&
+ mbminlen == other.mbminlen && mbmaxlen >= other.mbmaxlen &&
+ !((prtype ^ other.prtype) & ~(DATA_NOT_NULL | DATA_VERSIONED |
+ CHAR_COLL_MASK << 16 |
+ DATA_LONG_TRUE_VARCHAR));
+ }
+
+ /** @return whether the column values are comparable by memcmp() */
+ bool is_binary() const { return prtype & DATA_BINARY_TYPE; }
+};
+
+/** Index information put in a list of virtual column structure. Index
+id and virtual column position in the index will be logged.
+There can be multiple entries for a given index, with a different position. */
+struct dict_v_idx_t {
+ /** active index on the column */
+ dict_index_t* index;
+
+ /** position in this index */
+ ulint nth_field;
+
+ dict_v_idx_t(dict_index_t* index, ulint nth_field)
+ : index(index), nth_field(nth_field) {}
+};
+
+/** Data structure for a virtual column in a table */
+struct dict_v_col_t{
+ /** column structure */
+ dict_col_t m_col;
+
+ /** array of base column ptr */
+ dict_col_t** base_col;
+
+ /** number of base column */
+ unsigned num_base:10;
+
+ /** column pos in table */
+ unsigned v_pos:10;
+
+ /** Virtual index list, and column position in the index */
+ std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> >
+ v_indexes;
+
+ /** Detach the column from an index.
+ @param index index to be detached from */
+ void detach(const dict_index_t &index)
+ {
+ if (v_indexes.empty()) return;
+ auto i= v_indexes.before_begin();
+ do {
+ auto prev = i++;
+ if (i == v_indexes.end())
+ {
+ return;
+ }
+ if (i->index == &index)
+ {
+ v_indexes.erase_after(prev);
+ return;
+ }
+ }
+ while (i != v_indexes.end());
+ }
+};
+
+/** Data structure for newly added virtual column in a index.
+It is used only during rollback_inplace_alter_table() of
+addition of index depending on newly added virtual columns
+and uses index heap. Should be freed when index is being
+removed from cache. */
+struct dict_add_v_col_info
+{
+ ulint n_v_col;
+ dict_v_col_t *v_col;
+
+ /** Add the newly added virtual column while rollbacking
+ the index which contains new virtual columns
+ @param col virtual column to be duplicated
+ @param offset offset where to duplicate virtual column */
+ dict_v_col_t* add_drop_v_col(mem_heap_t *heap, dict_v_col_t *col,
+ ulint offset)
+ {
+ ut_ad(n_v_col);
+ ut_ad(offset < n_v_col);
+ if (!v_col)
+ v_col= static_cast<dict_v_col_t*>
+ (mem_heap_alloc(heap, n_v_col * sizeof *v_col));
+ new (&v_col[offset]) dict_v_col_t();
+ v_col[offset].m_col= col->m_col;
+ v_col[offset].v_pos= col->v_pos;
+ return &v_col[offset];
+ }
+};
+
+/** Data structure for newly added virtual column in a table */
+struct dict_add_v_col_t{
+ /** number of new virtual column */
+ ulint n_v_col;
+
+ /** column structures */
+ const dict_v_col_t* v_col;
+
+ /** new col names */
+ const char** v_col_name;
+};
+
+/** Data structure for a stored column in a table. */
+struct dict_s_col_t {
+ /** Stored column ptr */
+ dict_col_t* m_col;
+ /** array of base col ptr */
+ dict_col_t** base_col;
+ /** number of base columns */
+ ulint num_base;
+ /** column pos in table */
+ ulint s_pos;
+};
+
+/** list to put stored column for create_table_info_t */
+typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> >
+dict_s_col_list;
+
+/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and
+is the maximum indexed column length (or indexed prefix length) in
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format,
+any fixed-length field that is longer than this will be encoded as
+a variable-length field.
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes. This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_ANTELOPE_MAX_INDEX_COL_LEN REC_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Find out maximum indexed column length by its table format.
+For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
+ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
+#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \
+ (dict_table_has_atomic_blobs(table) \
+ ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \
+ (DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+ ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+/** Defines the maximum fixed length column size */
+#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN
+
+#ifdef WITH_WSREP
+#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500
+#endif /* WITH_WSREP */
+
+/** Data structure for a field in an index */
+struct dict_field_t{
+ dict_col_t* col; /*!< pointer to the table column */
+ id_name_t name; /*!< name of the column */
+ unsigned prefix_len:12; /*!< 0 or the length of the column
+ prefix in bytes in a MySQL index of
+ type, e.g., INDEX (textcol(25));
+ must be smaller than
+ DICT_MAX_FIELD_LEN_BY_FORMAT;
+ NOTE that in the UTF-8 charset, MySQL
+ sets this to (mbmaxlen * the prefix len)
+ in UTF-8 chars */
+ unsigned fixed_len:10; /*!< 0 or the fixed length of the
+ column if smaller than
+ DICT_ANTELOPE_MAX_INDEX_COL_LEN */
+ /** 1=DESC, 0=ASC */
+ unsigned descending:1;
+
+ /** Zero-initialize all fields */
+ dict_field_t() { memset((void*) this, 0, sizeof *this); }
+
+ /** Check whether two index fields are equivalent.
+ @param[in] old the other index field
+ @return whether the index fields are equivalent */
+ bool same(const dict_field_t& other) const
+ {
+ return(prefix_len == other.prefix_len
+ && fixed_len == other.fixed_len);
+ }
+};
+
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If data is not very compressible then leave
+some extra space 'padding' in the uncompressed page making it more
+likely that compression of less than fully packed uncompressed page will
+succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it's left intact.
+If the compression failure is lower than the desired rate for a fixed
+number of consecutive rounds, then the padding is decreased by a fixed
+value. This is done to prevent overshooting the padding value,
+and to accommodate the possible change in data compressibility. */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN (128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR (128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong zip_pad_max;
+
+/** Data structure to hold information about about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic. */
+struct zip_pad_info_t {
+ /** Dummy assignment operator for dict_index_t::clone() */
+ zip_pad_info_t &operator=(const zip_pad_info_t&) { return *this; }
+ std::mutex mutex; /*!< mutex protecting the info */
+ Atomic_relaxed<ulint>
+ pad; /*!< number of bytes used as pad */
+ ulint success;/*!< successful compression ops during
+ current round */
+ ulint failure;/*!< failed compression ops during
+ current round */
+ ulint n_rounds;/*!< number of currently successful
+ rounds */
+};
+
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
+
+/** Data structure for an index. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_t {
+ /** Columns whose character-set collation is being changed */
+ struct col_info
+ {
+ /** number of columns whose charset-collation is being changed */
+ unsigned n_cols;
+ /** columns with changed charset-collation */
+ dict_col_t *cols;
+
+ /** Add a column with changed collation. */
+ dict_col_t *add(mem_heap_t *heap, const dict_col_t &col, unsigned offset)
+ {
+ ut_ad(offset < n_cols);
+ if (!cols)
+ cols= static_cast<dict_col_t*>
+ (mem_heap_alloc(heap, n_cols * sizeof col));
+ new (&cols[offset]) dict_col_t(col);
+ return &cols[offset];
+ }
+ };
+
+ /** Maximum number of fields */
+ static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
+
+ index_id_t id; /*!< id of the index */
+ mem_heap_t* heap; /*!< memory heap */
+ id_name_t name; /*!< index name */
+ dict_table_t* table; /*!< back pointer to table */
+ /** root page number, or FIL_NULL if the index has been detached
+ from storage (DISCARD TABLESPACE or similar),
+ or 1 if the index is in table->freed_indexes */
+ unsigned page:32;
+ unsigned merge_threshold:6;
+ /*!< In the pessimistic delete, if the page
+ data size drops below this limit in percent,
+ merging it to a neighbor is tried */
+# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50
+ unsigned type:DICT_IT_BITS;
+ /*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+ DICT_IBUF, DICT_CORRUPT) */
+#define MAX_KEY_LENGTH_BITS 12
+ unsigned trx_id_offset:MAX_KEY_LENGTH_BITS;
+ /*!< position of the trx id column
+ in a clustered index record, if the fields
+ before it are known to be of a fixed size,
+ 0 otherwise */
+#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+#endif
+ unsigned n_user_defined_cols:10;
+ /*!< number of columns the user defined to
+ be in the index: in the internal
+ representation we add more columns */
+ unsigned nulls_equal:1;
+ /*!< if true, SQL NULL == SQL NULL */
+ unsigned n_uniq:10;/*!< number of fields from the beginning
+ which are enough to determine an index
+ entry uniquely */
+ unsigned n_def:10;/*!< number of fields defined so far */
+ unsigned n_fields:10;/*!< number of fields in the index */
+ unsigned n_nullable:10;/*!< number of nullable fields */
+ unsigned n_core_fields:10;/*!< number of fields in the index
+ (before the first time of instant add columns) */
+ /** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer
+ records; usually equal to UT_BITS_IN_BYTES(n_nullable), but
+ can be less in clustered indexes with instant ADD COLUMN */
+ unsigned n_core_null_bytes:8;
+ /** magic value signalling that n_core_null_bytes was not
+ initialized yet */
+ static const unsigned NO_CORE_NULL_BYTES = 0xff;
+ /** The clustered index ID of the hard-coded SYS_INDEXES table. */
+ static const unsigned DICT_INDEXES_ID = 3;
+ unsigned cached:1;/*!< TRUE if the index object is in the
+ dictionary cache */
+ unsigned to_be_dropped:1;
+ /*!< TRUE if the index is to be dropped;
+ protected by dict_sys.latch */
+ unsigned online_status:2;
+ /*!< enum online_index_status.
+ Transitions from ONLINE_INDEX_COMPLETE (to
+ ONLINE_INDEX_CREATION) are protected
+ by dict_sys.latch. Other changes are
+ protected by index->lock. */
+ unsigned uncommitted:1;
+ /*!< a flag that is set for secondary indexes
+ that have not been committed to the
+ data dictionary yet. Protected by
+ MDL */
+
+#ifdef UNIV_DEBUG
+ /** whether this is a dummy index object */
+ bool is_dummy;
+ /** whether btr_cur_instant_init() is in progress */
+ bool in_instant_init;
+ uint32_t magic_n;/*!< magic number */
+/** Value of dict_index_t::magic_n */
+# define DICT_INDEX_MAGIC_N 76789786
+#endif
+ dict_field_t* fields; /*!< array of field descriptions */
+ st_mysql_ftparser*
+ parser; /*!< fulltext parser plugin */
+
+ /** It just indicates whether newly added virtual column
+ during alter. It stores column in case of alter failure.
+ It should use heap from dict_index_t. It should be freed
+ while removing the index from table. */
+ dict_add_v_col_info* new_vcol_info;
+
+ /** During ALTER TABLE, columns that a being-added index depends on
+ and whose encoding or collation is being changed to something
+ that is compatible with the clustered index.
+ Allocated from dict_index_t::heap.
+
+ @see rollback_inplace_alter_table()
+ @see ha_innobase_inplace_ctx::col_collations */
+ col_info* change_col_info;
+
+ UT_LIST_NODE_T(dict_index_t)
+ indexes;/*!< list of indexes of the table */
+#ifdef BTR_CUR_ADAPT
+ btr_search_t* search_info;
+ /*!< info used in optimistic searches */
+#endif /* BTR_CUR_ADAPT */
+ row_log_t* online_log;
+ /*!< the log of modifications
+ during online index creation;
+ valid when online_status is
+ ONLINE_INDEX_CREATION */
+ /*----------------------*/
+ /** Statistics for query optimization */
+ /* @{ */
+ ib_uint64_t* stat_n_diff_key_vals;
+ /*!< approximate number of different
+ key values for this index, for each
+ n-column prefix where 1 <= n <=
+ dict_get_n_unique(index) (the array is
+ indexed from 0 to n_uniq-1); we
+ periodically calculate new
+ estimates */
+ ib_uint64_t* stat_n_sample_sizes;
+ /*!< number of pages that were sampled
+ to calculate each of stat_n_diff_key_vals[],
+ e.g. stat_n_sample_sizes[3] pages were sampled
+ to get the number stat_n_diff_key_vals[3]. */
+ ib_uint64_t* stat_n_non_null_key_vals;
+ /* approximate number of non-null key values
+ for this index, for each column where
+ 1 <= n <= dict_get_n_unique(index) (the array
+ is indexed from 0 to n_uniq-1); This
+ is used when innodb_stats_method is
+ "nulls_ignored". */
+ ulint stat_index_size;
+ /*!< approximate index size in
+ database pages */
+ ulint stat_n_leaf_pages;
+ /*!< approximate number of leaf pages in the
+ index tree */
+ bool stats_error_printed;
+ /*!< has persistent statistics error printed
+ for this index ? */
+ /* @} */
+ /** Statistics for defragmentation, these numbers are estimations and
+ could be very inaccurate at certain times, e.g. right after restart,
+ during defragmentation, etc. */
+ /* @{ */
+ ulint stat_defrag_modified_counter;
+ ulint stat_defrag_n_pages_freed;
+ /* number of pages freed by defragmentation. */
+ ulint stat_defrag_n_page_split;
+ /* number of page splits since last full index
+ defragmentation. */
+ ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+ /* data size when compression failure happened
+ the most recent 10 times. */
+ ulint stat_defrag_sample_next_slot;
+ /* in which slot the next sample should be
+ saved. */
+ /* @} */
+private:
+ /** R-tree split sequence number */
+ Atomic_relaxed<node_seq_t> rtr_ssn;
+public:
+ void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; }
+ node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; }
+ node_seq_t ssn() const { return rtr_ssn; }
+
+ rtr_info_track_t*
+ rtr_track;/*!< tracking all R-Tree search cursors */
+ trx_id_t trx_id; /*!< id of the transaction that created this
+ index, or 0 if the index existed
+ when InnoDB was started up */
+ zip_pad_info_t zip_pad;/*!< Information about state of
+ compression failures and successes */
+ /** lock protecting the non-leaf index pages */
+ mutable index_lock lock;
+
+ /** Determine if the index has been committed to the
+ data dictionary.
+ @return whether the index definition has been committed */
+ bool is_committed() const
+ {
+ ut_ad(!uncommitted || !(type & DICT_CLUSTERED));
+ return(UNIV_LIKELY(!uncommitted));
+ }
+
+ /** Flag an index committed or uncommitted.
+ @param[in] committed whether the index is committed */
+ void set_committed(bool committed)
+ {
+ ut_ad(!to_be_dropped);
+ ut_ad(committed || !(type & DICT_CLUSTERED));
+ ut_ad(!committed || !change_col_info);
+ uncommitted = !committed;
+ }
+
+ /** Notify that the index pages are going to be modified.
+ @param[in,out] mtr mini-transaction */
+ inline void set_modified(mtr_t& mtr) const;
+
+ /** @return whether this index is readable
+ @retval true normally
+ @retval false if this is a single-table tablespace
+ and the .ibd file is missing, or a
+ page cannot be read or decrypted */
+ inline bool is_readable() const;
+
+ /** @return whether instant ALTER TABLE is in effect */
+ inline bool is_instant() const;
+
+ /** @return whether the index is the primary key index
+ (not the clustered index of the change buffer) */
+ bool is_primary() const
+ {
+ return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF));
+ }
+
+ /** @return whether this is a generated clustered index */
+ bool is_gen_clust() const { return type == DICT_CLUSTERED; }
+
+ /** @return whether this is a clustered index */
+ bool is_clust() const { return type & DICT_CLUSTERED; }
+
+ /** @return whether this is a unique index */
+ bool is_unique() const { return type & DICT_UNIQUE; }
+
+ /** @return whether this is a spatial index */
+ bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); }
+
+ /** @return whether this is the change buffer */
+ bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
+
+ /** @return whether this index requires locking */
+ bool has_locking() const { return !is_ibuf(); }
+
+ /** @return whether this is a normal B-tree index
+ (not the change buffer, not SPATIAL or FULLTEXT) */
+ bool is_btree() const {
+ return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL
+ | DICT_FTS | DICT_CORRUPT)));
+ }
+
+ /** @return whether the index includes virtual columns */
+ bool has_virtual() const { return type & DICT_VIRTUAL; }
+
+ /** @return the position of DB_TRX_ID */
+ uint16_t db_trx_id() const {
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(n_uniq);
+ DBUG_ASSERT(n_uniq <= MAX_REF_PARTS);
+ return n_uniq;
+ }
+ /** @return the position of DB_ROLL_PTR */
+ uint16_t db_roll_ptr() const
+ {
+ return static_cast<uint16_t>(db_trx_id() + 1);
+ }
+
+ /** @return the offset of the metadata BLOB field,
+ or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */
+ uint16_t first_user_field() const
+ {
+ return static_cast<uint16_t>(db_trx_id() + 2);
+ }
+
+ /** @return whether the index is corrupted */
+ inline bool is_corrupted() const;
+
+ /** Detach the virtual columns from the index that is to be removed. */
+ void detach_columns()
+ {
+ if (!has_virtual() || !cached)
+ return;
+ for (unsigned i= 0; i < n_fields; i++)
+ {
+ dict_col_t* col= fields[i].col;
+ if (!col || !col->is_virtual())
+ continue;
+ col->detach(*this);
+ }
+ }
+
+ /** Determine how many fields of a given prefix can be set NULL.
+ @param[in] n_prefix number of fields in the prefix
+ @return number of fields 0..n_prefix-1 that can be set NULL */
+ unsigned get_n_nullable(ulint n_prefix) const
+ {
+ DBUG_ASSERT(n_prefix > 0);
+ DBUG_ASSERT(n_prefix <= n_fields);
+ unsigned n = n_nullable;
+ for (; n_prefix < n_fields; n_prefix++) {
+ const dict_col_t* col = fields[n_prefix].col;
+ DBUG_ASSERT(!col->is_virtual());
+ n -= col->is_nullable();
+ }
+ DBUG_ASSERT(n < n_def);
+ return n;
+ }
+
+ /** Get the default value of an instantly-added clustered index field.
+ @param[in] n instantly added field position
+ @param[out] len value length (in bytes), or UNIV_SQL_NULL
+ @return default value
+ @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+ const byte* instant_field_value(ulint n, ulint* len) const
+ {
+ DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID);
+ DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields);
+ DBUG_ASSERT(n < n_fields);
+ return fields[n].col->instant_value(len);
+ }
+
+ /** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+ @param[in] clustered index definition after instant ALTER TABLE */
+ inline void instant_add_field(const dict_index_t& instant);
+ /** Remove instant ADD COLUMN metadata. */
+ inline void clear_instant_add();
+ /** Remove instant ALTER TABLE metadata. */
+ inline void clear_instant_alter();
+
+ /** Construct the metadata record for instant ALTER TABLE.
+ @param[in] row dummy or default values for existing columns
+ @param[in,out] heap memory heap for allocations
+ @return metadata record */
+ inline dtuple_t*
+ instant_metadata(const dtuple_t& row, mem_heap_t* heap) const;
+
+ /** Check if record in clustered index is historical row.
+ @param[in] rec clustered row
+ @param[in] offsets offsets
+ @return true if row is historical */
+ bool
+ vers_history_row(const rec_t* rec, const rec_offs* offsets);
+
+ /** Check if record in secondary index is historical row.
+ @param[in] rec record in a secondary index
+ @param[out] history_row true if row is historical
+ @return true on error */
+ bool
+ vers_history_row(const rec_t* rec, bool &history_row);
+
+ /** Assign the number of new column to be added as a part
+ of the index
+ @param n_vcol number of virtual columns to be added */
+ void assign_new_v_col(ulint n_vcol)
+ {
+ new_vcol_info= static_cast<dict_add_v_col_info*>
+ (mem_heap_zalloc(heap, sizeof *new_vcol_info));
+ new_vcol_info->n_v_col= n_vcol;
+ }
+
+ /* @return whether index has new virtual column */
+ bool has_new_v_col() const { return new_vcol_info; }
+
+ /* @return number of newly added virtual column */
+ ulint get_new_n_vcol() const
+ { return new_vcol_info ? new_vcol_info->n_v_col : 0; }
+
+ /** Assign the number of collation change fields as a part of the index
+ @param n_cols number of columns whose collation is changing */
+ void init_change_cols(unsigned n_cols)
+ {
+ ut_ad(n_fields > n_cols || type & DICT_FTS);
+ change_col_info= static_cast<col_info*>
+ (mem_heap_zalloc(heap, sizeof(col_info)));
+ change_col_info->n_cols= n_cols;
+ }
+
+ /** Reconstruct the clustered index fields.
+ @return whether metadata is incorrect */
+ inline bool reconstruct_fields();
+
+ /** Check if the index contains a column or a prefix of that column.
+ @param[in] n column number
+ @param[in] is_virtual whether it is a virtual col
+ @return whether the index contains the column or its prefix */
+ bool contains_col_or_prefix(ulint n, bool is_virtual) const
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /** @return a clone of this */
+ dict_index_t* clone() const;
+ /** Clone this index for lazy dropping of the adaptive hash index.
+ @return this or a clone */
+ dict_index_t* clone_if_needed();
+ /** @return number of leaf pages pointed to by the adaptive hash index */
+ inline ulint n_ahi_pages() const;
+ /** @return whether mark_freed() had been invoked */
+ bool freed() const { return UNIV_UNLIKELY(page == 1); }
+ /** Note that the index is waiting for btr_search_lazy_free() */
+ void set_freed() { ut_ad(!freed()); page= 1; }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /** @return whether it is forbidden to invoke clear_instant_add() */
+ bool must_avoid_clear_instant_add() const
+ {
+ if (is_instant())
+ for (auto i= this; (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+ if (i->to_be_dropped /* || i->online_log*/)
+ return true;
+ return false;
+ }
+
+ /** This ad-hoc class is used by record_size_info only. */
+ class record_size_info_t {
+ public:
+ record_size_info_t()
+ : max_leaf_size(0), shortest_size(0), too_big(false),
+ first_overrun_field_index(SIZE_T_MAX), overrun_size(0)
+ {
+ }
+
+ /** Mark row potentially too big for page and set up first
+ overflow field index. */
+ void set_too_big(size_t field_index)
+ {
+ ut_ad(field_index != SIZE_T_MAX);
+
+ too_big = true;
+ if (first_overrun_field_index > field_index) {
+ first_overrun_field_index = field_index;
+ overrun_size = shortest_size;
+ }
+ }
+
+ /** @return overrun field index or SIZE_T_MAX if nothing
+ overflowed*/
+ size_t get_first_overrun_field_index() const
+ {
+ ut_ad(row_is_too_big());
+ ut_ad(first_overrun_field_index != SIZE_T_MAX);
+ return first_overrun_field_index;
+ }
+
+ size_t get_overrun_size() const
+ {
+ ut_ad(row_is_too_big());
+ return overrun_size;
+ }
+
+ bool row_is_too_big() const { return too_big; }
+
+ size_t max_leaf_size; /** Bigger row size this index can
+ produce */
+ size_t shortest_size; /** shortest because it counts everything
+ as in overflow pages */
+
+ private:
+ bool too_big; /** This one is true when maximum row size this
+ index can produce is bigger than maximum row
+ size given page can hold. */
+ size_t first_overrun_field_index; /** After adding this field
+ index row overflowed maximum
+ allowed size. Useful for
+ reporting back to user. */
+ size_t overrun_size; /** Just overrun row size */
+ };
+
+ /** Returns max possibly record size for that index, size of a shortest
+ everything in overflow) size of the longest possible row and index
+ of a field which made index records too big to fit on a page.*/
+ inline record_size_info_t record_size_info() const;
+
+ /** Clear the index tree and reinitialize the root page, in the
+ rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+ @param thr query thread
+ @return error code */
+ dberr_t clear(que_thr_t *thr);
+
+ /** Check whether the online log is dummy value to indicate
+ whether table undergoes active DDL.
+ @retval true if online log is dummy value */
+ bool online_log_is_dummy() const
+ {
+ return online_log == reinterpret_cast<const row_log_t*>(this);
+ }
+
+ /** Assign clustered index online log to dummy value */
+ void online_log_make_dummy()
+ {
+ online_log= reinterpret_cast<row_log_t*>(this);
+ }
+};
+
+/** Detach a virtual column from an index.
+@param index being-freed index */
+inline void dict_col_t::detach(const dict_index_t &index)
+{
+ if (is_virtual())
+ reinterpret_cast<dict_v_col_t*>(this)->detach(index);
+}
+
+/** Add a field definition to an index.
+@param index index
+@param name pointer to column name
+@param prefix_len column prefix length, or 0
+@param descending whether to use descending order */
+inline void dict_mem_index_add_field(dict_index_t *index, const char *name,
+ ulint prefix_len, bool descending= false)
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ dict_field_t &field= index->fields[index->n_def++];
+ field.name= name;
+ field.prefix_len= prefix_len & ((1U << 12) - 1);
+ field.descending= descending;
+}
+
+/** The status of online index creation */
+enum online_index_status {
+ /** the index is complete and ready for access */
+ ONLINE_INDEX_COMPLETE = 0,
+ /** the index is being created, online
+ (allowing concurrent modifications) */
+ ONLINE_INDEX_CREATION,
+ /** secondary index creation was aborted and the index
+ should be dropped as soon as index->table->n_ref_count reaches 0,
+ or online table rebuild was aborted and the clustered index
+ of the original table should soon be restored to
+ ONLINE_INDEX_COMPLETE */
+ ONLINE_INDEX_ABORTED,
+ /** the online index creation was aborted, the index was
+ dropped from the data dictionary and the tablespace, and it
+ should be dropped from the data dictionary cache as soon as
+ index->table->n_ref_count reaches 0. */
+ ONLINE_INDEX_ABORTED_DROPPED
+};
+
+/** Set to store the virtual columns which are affected by Foreign
+key constraint. */
+typedef std::set<dict_v_col_t*, std::less<dict_v_col_t*>,
+ ut_allocator<dict_v_col_t*> > dict_vcol_set;
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_t{
+ mem_heap_t* heap; /*!< this object is allocated from
+ this memory heap */
+ char* id; /*!< id of the constraint as a
+ null-terminated string */
+ unsigned n_fields:10; /*!< number of indexes' first fields
+ for which the foreign key
+ constraint is defined: we allow the
+ indexes to contain more fields than
+ mentioned in the constraint, as long
+ as the first fields are as mentioned */
+ unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE
+ or DICT_FOREIGN_ON_DELETE_SET_NULL */
+ char* foreign_table_name;/*!< foreign table name */
+ char* foreign_table_name_lookup;
+ /*!< foreign table name used for dict lookup */
+ dict_table_t* foreign_table; /*!< table where the foreign key is */
+ const char** foreign_col_names;/*!< names of the columns in the
+ foreign key */
+ char* referenced_table_name;/*!< referenced table name */
+ char* referenced_table_name_lookup;
+ /*!< referenced table name for dict lookup*/
+ dict_table_t* referenced_table;/*!< table where the referenced key
+ is */
+ const char** referenced_col_names;/*!< names of the referenced
+ columns in the referenced table */
+ dict_index_t* foreign_index; /*!< foreign index; we require that
+ both tables contain explicitly defined
+ indexes for the constraint: InnoDB
+ does not generate new indexes
+ implicitly */
+ dict_index_t* referenced_index;/*!< referenced index */
+
+ dict_vcol_set* v_cols; /*!< set of virtual columns affected
+ by foreign key constraint. */
+
+ /** Check whether the fulltext index gets affected by
+ foreign key constraint */
+ bool affects_fulltext() const;
+};
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign);
+
+struct dict_foreign_print {
+
+ dict_foreign_print(std::ostream& out)
+ : m_out(out)
+ {}
+
+ void operator()(const dict_foreign_t* foreign) {
+ m_out << *foreign;
+ }
+private:
+ std::ostream& m_out;
+};
+
+/** Compare two dict_foreign_t objects using their ids. Used in the ordering
+of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns
+true if the first argument is considered to go before the second in the
+strict weak ordering it defines, and false otherwise. */
+struct dict_foreign_compare {
+
+ bool operator()(
+ const dict_foreign_t* lhs,
+ const dict_foreign_t* rhs) const
+ {
+ return strcmp(lhs->id, rhs->id) < 0;
+ }
+};
+
+/** A function object to find a foreign key with the given index as the
+referenced index. Return the foreign key with matching criteria or NULL */
+struct dict_foreign_with_index {
+
+ dict_foreign_with_index(const dict_index_t* index)
+ : m_index(index)
+ {}
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ return(foreign->referenced_index == m_index);
+ }
+
+ const dict_index_t* m_index;
+};
+
+/* A function object to check if the foreign constraint is between different
+tables. Returns true if foreign key constraint is between different tables,
+false otherwise. */
+struct dict_foreign_different_tables {
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ return(foreign->foreign_table != foreign->referenced_table);
+ }
+};
+
+/** A function object to check if the foreign key constraint has the same
+name as given. If the full name of the foreign key constraint doesn't match,
+then, check if removing the database name from the foreign key constraint
+matches. Return true if it matches, false otherwise. */
+struct dict_foreign_matches_id {
+
+ dict_foreign_matches_id(const char* id)
+ : m_id(id)
+ {}
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ if (0 == innobase_strcasecmp(foreign->id, m_id)) {
+ return(true);
+ }
+ if (const char* pos = strchr(foreign->id, '/')) {
+ if (0 == innobase_strcasecmp(m_id, pos + 1)) {
+ return(true);
+ }
+ }
+ return(false);
+ }
+
+ const char* m_id;
+};
+
+typedef std::set<
+ dict_foreign_t*,
+ dict_foreign_compare,
+ ut_allocator<dict_foreign_t*> > dict_foreign_set;
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set);
+
+/** Function object to check if a foreign key object is there
+in the given foreign key set or not. It returns true if the
+foreign key is not found, false otherwise */
+struct dict_foreign_not_exists {
+ dict_foreign_not_exists(const dict_foreign_set& obj_)
+ : m_foreigns(obj_)
+ {}
+
+ /* Return true if the given foreign key is not found */
+ bool operator()(dict_foreign_t* const & foreign) const {
+ return(m_foreigns.find(foreign) == m_foreigns.end());
+ }
+private:
+ const dict_foreign_set& m_foreigns;
+};
+
+/** Validate the search order in the foreign key set.
+@param[in] fk_set the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_foreign_set& fk_set);
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in] table table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_table_t& table);
+
+/*********************************************************************//**
+Frees a foreign key struct. */
+inline
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign) /*!< in, own: foreign key struct */
+{
+ if (foreign->v_cols != NULL) {
+ UT_DELETE(foreign->v_cols);
+ }
+
+ mem_heap_free(foreign->heap);
+}
+
+/** The destructor will free all the foreign key constraints in the set
+by calling dict_foreign_free() on each of the foreign key constraints.
+This is used to free the allocated memory when a local set goes out
+of scope. */
+struct dict_foreign_set_free {
+
+ dict_foreign_set_free(const dict_foreign_set& foreign_set)
+ : m_foreign_set(foreign_set)
+ {}
+
+ ~dict_foreign_set_free()
+ {
+ std::for_each(m_foreign_set.begin(),
+ m_foreign_set.end(),
+ dict_foreign_free);
+ }
+
+ const dict_foreign_set& m_foreign_set;
+};
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE 1U /*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL 2U /*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE 4U /*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8U /*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16U /*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32U /*!< ON UPDATE NO ACTION */
+/* @} */
+
+/** Display an identifier.
+@param[in,out] s output stream
+@param[in] id_name SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const id_name_t& id_name);
+
+/** Display a table name.
+@param[in,out] s output stream
+@param[in] table_name table name
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const table_name_t& table_name);
+
+/** List of locks that different transactions have acquired on a table. This
+list has a list node that is embedded in a nested union/structure. We have to
+generate a specific template for it. */
+
+typedef ut_list_base<lock_t, ut_list_node<lock_t> lock_table_t::*>
+ table_lock_list_t;
+
+/** mysql template structure defined in row0mysql.cc */
+struct mysql_row_templ_t;
+
+/** Structure defines template related to virtual columns and
+their base columns */
+struct dict_vcol_templ_t {
+ /** number of regular columns */
+ ulint n_col;
+
+ /** number of virtual columns */
+ ulint n_v_col;
+
+ /** array of templates for virtual col and their base columns */
+ mysql_row_templ_t** vtempl;
+
+ /** table's database name */
+ std::string db_name;
+
+ /** table name */
+ std::string tb_name;
+
+ /** MySQL record length */
+ ulint rec_len;
+
+ /** default column value if any */
+ byte* default_rec;
+
+ /** cached MySQL TABLE object */
+ TABLE* mysql_table;
+
+ /** when mysql_table was cached */
+ uint64_t mysql_table_query_id;
+
+ dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {}
+};
+
+/** Metadata on clustered index fields starting from first_user_field() */
+class field_map_element_t
+{
+ /** Number of bits for representing a column number */
+ static constexpr uint16_t IND_BITS = 10;
+
+ /** Set if the column of the field has been instantly dropped */
+ static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5);
+
+ /** Set if the column was dropped and originally declared NOT NULL */
+ static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4);
+
+ /** Column index (if !(data & DROPPED)): table->cols[data & IND],
+ or field length (if (data & DROPPED)):
+ (data & IND) = 0 if variable-length with max_len < 256 bytes;
+ (data & IND) = 1 if variable-length with max_len > 255 bytes;
+ (data & IND) = 1 + L otherwise, with L=fixed length of the column */
+ static constexpr uint16_t IND = (1U << IND_BITS) - 1;
+
+ /** Field metadata */
+ uint16_t data;
+
+ void clear_not_null() { data &= uint16_t(~NOT_NULL); }
+public:
+ bool is_dropped() const { return data & DROPPED; }
+ void set_dropped() { data |= DROPPED; }
+ bool is_not_null() const { return data & NOT_NULL; }
+ void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; }
+ uint16_t ind() const { return data & IND; }
+ void set_ind(uint16_t i)
+ {
+ DBUG_ASSERT(i <= IND);
+ DBUG_ASSERT(!ind());
+ data |= i;
+ }
+ field_map_element_t& operator= (uint16_t value)
+ {
+ data = value;
+ return *this;
+ }
+ operator uint16_t() { return data; }
+};
+
+static_assert(sizeof(field_map_element_t) == 2,
+ "Size mismatch for a persistent data item!");
+
+/** Instantly dropped or reordered columns */
+struct dict_instant_t
+{
+ /** Number of dropped columns */
+ unsigned n_dropped;
+ /** Dropped columns */
+ dict_col_t* dropped;
+ /** Map of clustered index non-PK fields[i - first_user_field()]
+ to table columns */
+ field_map_element_t* field_map;
+};
+
+/** These are used when MySQL FRM and InnoDB data dictionary are
+in inconsistent state. */
+typedef enum {
+ DICT_FRM_CONSISTENT = 0, /*!< Consistent state */
+ DICT_FRM_NO_PK = 1, /*!< MySQL has no primary key
+ but InnoDB dictionary has
+ non-generated one. */
+ DICT_NO_PK_FRM_HAS = 2, /*!< MySQL has primary key but
+ InnoDB dictionary has not. */
+ DICT_FRM_INCONSISTENT_KEYS = 3 /*!< Key count mismatch */
+} dict_frm_t;
+
+/** Data structure for a database table. Most fields will be
+zero-initialized in dict_table_t::create(). */
+struct dict_table_t {
+
+ /** Get reference count.
+ @return current value of n_ref_count */
+ inline uint32_t get_ref_count() const { return n_ref_count; }
+
+ /** Acquire the table handle. */
+ inline void acquire();
+
+ /** Release the table handle.
+ @return whether the last handle was released */
+ inline bool release();
+
+ /** @return whether the table supports transactions */
+ bool no_rollback() const
+ {
+ return !(~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK);
+ }
+ /** @return whether this is a temporary table */
+ bool is_temporary() const
+ {
+ return flags2 & DICT_TF2_TEMPORARY;
+ }
+
+ /** @return whether the table is not in ROW_FORMAT=REDUNDANT */
+ bool not_redundant() const { return flags & DICT_TF_COMPACT; }
+
+ /** @return whether this table is readable
+ @retval true normally
+ @retval false if this is a single-table tablespace
+ and the .ibd file is missing, or a
+ page cannot be read or decrypted */
+ bool is_readable() const
+ {
+ ut_ad(file_unreadable || space);
+ return(UNIV_LIKELY(!file_unreadable));
+ }
+
+ /** @return whether the table is accessible */
+ bool is_accessible() const
+ {
+ return UNIV_LIKELY(is_readable() && !corrupted && space)
+ && !space->is_stopping();
+ }
+
+ /** Check if a table name contains the string "/#sql"
+ which denotes temporary or intermediate tables in MariaDB. */
+ static bool is_temporary_name(const char* name)
+ {
+ return strstr(name, "/#sql");
+ }
+
+ /** @return whether instant ALTER TABLE is in effect */
+ bool is_instant() const
+ {
+ return(UT_LIST_GET_FIRST(indexes)->is_instant());
+ }
+
+ /** @return whether the table supports instant ALTER TABLE */
+ bool supports_instant() const
+ {
+ return(!(flags & DICT_TF_MASK_ZIP_SSIZE));
+ }
+
+ /** @return the number of instantly dropped columns */
+ unsigned n_dropped() const { return instant ? instant->n_dropped : 0; }
+
+ /** Look up an old column.
+ @param[in] cols the old columns of the table
+ @param[in] col_map map from old table columns to altered ones
+ @param[in] n_cols number of old columns
+ @param[in] i the number of the new column
+ @return old column
+ @retval NULL if column i was added to the table */
+ static const dict_col_t* find(const dict_col_t* cols,
+ const ulint* col_map, ulint n_cols,
+ ulint i)
+ {
+ for (ulint o = n_cols; o--; ) {
+ if (col_map[o] == i) {
+ return &cols[o];
+ }
+ }
+ return NULL;
+ }
+
+ /** Serialise metadata of dropped or reordered columns.
+ @param[in,out] heap memory heap for allocation
+ @param[out] field data field with the metadata */
+ inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const;
+
+ /** Reconstruct dropped or reordered columns.
+ @param[in] metadata data from serialise_columns()
+ @param[in] len length of the metadata, in bytes
+ @return whether parsing the metadata failed */
+ bool deserialise_columns(const byte* metadata, ulint len);
+
+ /** Set is_instant() before instant_column().
+ @param[in] old previous table definition
+ @param[in] col_map map from old.cols[]
+ and old.v_cols[] to this
+ @param[out] first_alter_pos 0, or
+ 1 + first changed column position */
+ inline void prepare_instant(const dict_table_t& old,
+ const ulint* col_map,
+ unsigned& first_alter_pos);
+
+ /** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+ @param[in] table table on which prepare_instant() was invoked
+ @param[in] col_map mapping from cols[] and v_cols[] to table
+ @return whether the metadata record must be updated */
+ inline bool instant_column(const dict_table_t& table,
+ const ulint* col_map);
+
+ /** Roll back instant_column().
+ @param[in] old_n_cols original n_cols
+ @param[in] old_cols original cols
+ @param[in] old_col_names original col_names
+ @param[in] old_instant original instant structure
+ @param[in] old_fields original fields
+ @param[in] old_n_fields original number of fields
+ @param[in] old_n_core_fields original number of core fields
+ @param[in] old_n_v_cols original n_v_cols
+ @param[in] old_v_cols original v_cols
+ @param[in] old_v_col_names original v_col_names
+ @param[in] col_map column map */
+ inline void rollback_instant(
+ unsigned old_n_cols,
+ dict_col_t* old_cols,
+ const char* old_col_names,
+ dict_instant_t* old_instant,
+ dict_field_t* old_fields,
+ unsigned old_n_fields,
+ unsigned old_n_core_fields,
+ unsigned old_n_v_cols,
+ dict_v_col_t* old_v_cols,
+ const char* old_v_col_names,
+ const ulint* col_map);
+
+ /** Add the table definition to the data dictionary cache */
+ void add_to_cache();
+
+ /** @return whether the table is versioned.
+ It is assumed that both vers_start and vers_end set to 0
+ iff table is not versioned. In any other case,
+ these fields correspond to actual positions in cols[]. */
+ bool versioned() const { return vers_start || vers_end; }
+ bool versioned_by_id() const
+ {
+ return versioned() && cols[vers_start].mtype == DATA_INT;
+ }
+
+ /** For overflow fields returns potential max length stored inline */
+ inline size_t get_overflow_field_local_len() const;
+
+ /** Parse the table file name into table name and database name.
+ @tparam dict_frozen whether the caller holds dict_sys.latch
+ @param[in,out] db_name database name buffer
+ @param[in,out] tbl_name table name buffer
+ @param[out] db_name_len database name length
+ @param[out] tbl_name_len table name length
+ @return whether the table name is visible to SQL */
+ template<bool dict_frozen= false>
+ bool parse_name(char (&db_name)[NAME_LEN + 1],
+ char (&tbl_name)[NAME_LEN + 1],
+ size_t *db_name_len, size_t *tbl_name_len) const;
+
+ /** Clear the table when rolling back TRX_UNDO_EMPTY
+ @return error code */
+ dberr_t clear(que_thr_t *thr);
+
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the lock_mutex */
+ bool lock_mutex_is_owner() const
+ { return lock_mutex_owner == pthread_self(); }
+ /** @return whether the current thread holds the stats_mutex (lock_mutex) */
+ bool stats_mutex_is_owner() const
+ { return lock_mutex_owner == pthread_self(); }
+#endif /* UNIV_DEBUG */
+ void lock_mutex_init() { lock_mutex.init(); }
+ void lock_mutex_destroy() { lock_mutex.destroy(); }
+ /** Acquire lock_mutex */
+ void lock_mutex_lock()
+ {
+ ut_ad(!lock_mutex_is_owner());
+ lock_mutex.wr_lock();
+ ut_ad(!lock_mutex_owner.exchange(pthread_self()));
+ }
+ /** Try to acquire lock_mutex */
+ bool lock_mutex_trylock()
+ {
+ ut_ad(!lock_mutex_is_owner());
+ bool acquired= lock_mutex.wr_lock_try();
+ ut_ad(!acquired || !lock_mutex_owner.exchange(pthread_self()));
+ return acquired;
+ }
+ /** Release lock_mutex */
+ void lock_mutex_unlock()
+ {
+ ut_ad(lock_mutex_owner.exchange(0) == pthread_self());
+ lock_mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether the lock mutex is held by some thread */
+ bool lock_mutex_is_locked() const noexcept { return lock_mutex.is_locked(); }
+#endif
+
+ /* stats mutex lock currently defaults to lock_mutex but in the future,
+ there could be a use-case to have separate mutex for stats.
+ extra indirection (through inline so no performance hit) should
+ help simplify code and increase long-term maintainability */
+ void stats_mutex_init() { lock_mutex_init(); }
+ void stats_mutex_destroy() { lock_mutex_destroy(); }
+ void stats_mutex_lock() { lock_mutex_lock(); }
+ void stats_mutex_unlock() { lock_mutex_unlock(); }
+
+ /** Rename the data file.
+ @param new_name name of the table
+ @param replace whether to replace the file with the new name
+ (as part of rolling back TRUNCATE) */
+ dberr_t rename_tablespace(span<const char> new_name, bool replace) const;
+
+private:
+ /** Initialize instant->field_map.
+ @param[in] table table definition to copy from */
+ inline void init_instant(const dict_table_t& table);
+public:
+ /** Id of the table. */
+ table_id_t id;
+ /** dict_sys.id_hash chain node */
+ dict_table_t* id_hash;
+ /** Table name in name_hash */
+ table_name_t name;
+ /** dict_sys.name_hash chain node */
+ dict_table_t* name_hash;
+
+ /** Memory heap */
+ mem_heap_t* heap;
+
+ /** NULL or the directory path specified by DATA DIRECTORY. */
+ char* data_dir_path;
+
+ /** The tablespace of the table */
+ fil_space_t* space;
+ /** Tablespace ID */
+ uint32_t space_id;
+
+ /** Stores information about:
+ 1 row format (redundant or compact),
+ 2 compressed page size (zip shift size),
+ 3 whether using atomic blobs,
+ 4 whether the table has been created with the option DATA DIRECTORY.
+ Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(),
+ DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this
+ flag. */
+ unsigned flags:DICT_TF_BITS;
+
+ /** Stores information about:
+ 1 whether the table has been created using CREATE TEMPORARY TABLE,
+ 2 whether the table has an internally defined DOC ID column,
+ 3 whether the table has a FTS index,
+ 4 whether DOC ID column need to be added to the FTS index,
+ 5 whether the table is being created its own tablespace,
+ 6 whether the table has been DISCARDed,
+ 7 whether the aux FTS tables names are in hex.
+ Use DICT_TF2_FLAG_IS_SET() to parse this flag. */
+ unsigned flags2:DICT_TF2_BITS;
+
+ /** TRUE if the table is an intermediate table during copy alter
+ operation or a partition/subpartition which is required for copying
+ data and skip the undo log for insertion of row in the table.
+ This variable will be set and unset during extra(), or during the
+ process of altering partitions */
+ unsigned skip_alter_undo:1;
+
+ /*!< whether this is in a single-table tablespace and the .ibd
+ file is missing or page decryption failed and page is corrupted */
+ unsigned file_unreadable:1;
+
+ /** TRUE if the table object has been added to the dictionary cache. */
+ unsigned cached:1;
+
+ /** Number of non-virtual columns defined so far. */
+ unsigned n_def:10;
+
+ /** Number of non-virtual columns. */
+ unsigned n_cols:10;
+
+ /** Number of total columns (inlcude virtual and non-virtual) */
+ unsigned n_t_cols:10;
+
+ /** Number of total columns defined so far. */
+ unsigned n_t_def:10;
+
+ /** Number of virtual columns defined so far. */
+ unsigned n_v_def:10;
+
+ /** Number of virtual columns. */
+ unsigned n_v_cols:10;
+
+ /** 1 + the position of autoinc counter field in clustered
+ index, or 0 if there is no persistent AUTO_INCREMENT column in
+ the table. */
+ unsigned persistent_autoinc:10;
+
+ /** TRUE if it's not an InnoDB system table or a table that has no FK
+ relationships. */
+ unsigned can_be_evicted:1;
+
+ /** TRUE if table is corrupted. */
+ unsigned corrupted:1;
+
+ /** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED
+ or ONLINE_INDEX_ABORTED_DROPPED. */
+ unsigned drop_aborted:1;
+
+ /** Array of column descriptions. */
+ dict_col_t* cols;
+
+ /** Array of virtual column descriptions. */
+ dict_v_col_t* v_cols;
+
+ /** List of stored column descriptions. It is used only for foreign key
+ check during create table and copy alter operations.
+ During copy alter, s_cols list is filled during create table operation
+ and need to preserve till rename table operation. That is the
+ reason s_cols is a part of dict_table_t */
+ dict_s_col_list* s_cols;
+
+ /** Instantly dropped or reordered columns, or NULL if none */
+ dict_instant_t* instant;
+
+ /** Column names packed in a character string
+ "name1\0name2\0...nameN\0". Until the string contains n_cols, it will
+ be allocated from a temporary heap. The final string will be allocated
+ from table->heap. */
+ const char* col_names;
+
+ /** Virtual column names */
+ const char* v_col_names;
+ unsigned vers_start:10;
+ /*!< System Versioning: row start col index */
+ unsigned vers_end:10;
+ /*!< System Versioning: row end col index */
+ bool is_system_db;
+ /*!< True if the table belongs to a system
+ database (mysql, information_schema or
+ performance_schema) */
+ dict_frm_t dict_frm_mismatch;
+ /*!< !DICT_FRM_CONSISTENT==0 if data
+ dictionary information and
+ MySQL FRM information mismatch. */
+ /** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */
+ dict_index_t* fts_doc_id_index;
+
+ /** List of indexes of the table. */
+ UT_LIST_BASE_NODE_T(dict_index_t) indexes;
+#ifdef BTR_CUR_HASH_ADAPT
+ /** List of detached indexes that are waiting to be freed along with
+ the last adaptive hash index entry.
+ Protected by autoinc_mutex (sic!) */
+ UT_LIST_BASE_NODE_T(dict_index_t) freed_indexes;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /** List of foreign key constraints in the table. These refer to
+ columns in other tables. */
+ UT_LIST_BASE_NODE_T(dict_foreign_t) foreign_list;
+
+ /** List of foreign key constraints which refer to this table. */
+ UT_LIST_BASE_NODE_T(dict_foreign_t) referenced_list;
+
+ /** Node of the LRU list of tables. */
+ UT_LIST_NODE_T(dict_table_t) table_LRU;
+
+ /** Maximum recursive level we support when loading tables chained
+ together with FK constraints. If exceeds this level, we will stop
+ loading child table into memory along with its parent table. */
+ byte fk_max_recusive_level;
+
+ /** DDL transaction that last touched the table definition, or 0 if
+ no history is available. This includes possible changes in
+ ha_innobase::prepare_inplace_alter_table() and
+ ha_innobase::commit_inplace_alter_table(). */
+ trx_id_t def_trx_id;
+
+ /** Last transaction that inserted into an empty table.
+ Updated while holding exclusive table lock and an exclusive
+ latch on the clustered index root page (which must also be
+ an empty leaf page), and an ahi_latch (if btr_search_enabled). */
+ Atomic_relaxed<trx_id_t> bulk_trx_id;
+
+ /** Original table name, for MDL acquisition in purge. Normally,
+ this points to the same as name. When is_temporary_name(name.m_name) holds,
+ this should be a copy of the original table name, allocated from heap. */
+ table_name_t mdl_name;
+
+ /*!< set of foreign key constraints in the table; these refer to
+ columns in other tables */
+ dict_foreign_set foreign_set;
+
+ /*!< set of foreign key constraints which refer to this table */
+ dict_foreign_set referenced_set;
+
+ /** Statistics for query optimization. Mostly protected by
+ dict_sys.latch and stats_mutex_lock(). @{ */
+
+ /** TRUE if statistics have been calculated the first time after
+ database startup or table creation. */
+ unsigned stat_initialized:1;
+
+ /** Timestamp of last recalc of the stats. */
+ time_t stats_last_recalc;
+
+ /** The two bits below are set in the 'stat_persistent' member. They
+ have the following meaning:
+ 1. _ON=0, _OFF=0, no explicit persistent stats setting for this table,
+ the value of the global srv_stats_persistent is used to determine
+ whether the table has persistent stats enabled or not
+ 2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this
+ table, regardless of the value of the global srv_stats_persistent
+ 3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this
+ table, regardless of the value of the global srv_stats_persistent
+ 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+ #define DICT_STATS_PERSISTENT_ON (1 << 1)
+ #define DICT_STATS_PERSISTENT_OFF (1 << 2)
+
+ /** Indicates whether the table uses persistent stats or not. See
+ DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */
+ ib_uint32_t stat_persistent;
+
+ /** The two bits below are set in the 'stats_auto_recalc' member. They
+ have the following meaning:
+ 1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the
+ value of the global srv_stats_persistent_auto_recalc is used to
+ determine whether the table has auto recalc enabled or not
+ 2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table,
+ regardless of the value of the global srv_stats_persistent_auto_recalc
+ 3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table,
+ regardless of the value of the global srv_stats_persistent_auto_recalc
+ 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+ #define DICT_STATS_AUTO_RECALC_ON (1 << 1)
+ #define DICT_STATS_AUTO_RECALC_OFF (1 << 2)
+
+ /** Indicates whether the table uses automatic recalc for persistent
+ stats or not. See DICT_STATS_AUTO_RECALC_ON and
+ DICT_STATS_AUTO_RECALC_OFF. */
+ ib_uint32_t stats_auto_recalc;
+
+ /** The number of pages to sample for this table during persistent
+ stats estimation. If this is 0, then the value of the global
+ srv_stats_persistent_sample_pages will be used instead. */
+ ulint stats_sample_pages;
+
+ /** Approximate number of rows in the table. We periodically calculate
+ new estimates. */
+ ib_uint64_t stat_n_rows;
+
+ /** Approximate clustered index size in database pages. */
+ ulint stat_clustered_index_size;
+
+ /** Approximate size of other indexes in database pages. */
+ ulint stat_sum_of_other_index_sizes;
+
+ /** How many rows are modified since last stats recalc. When a row is
+ inserted, updated, or deleted, we add 1 to this number; we calculate
+ new estimates for the table and the indexes if the table has changed
+ too much, see dict_stats_update_if_needed(). The counter is reset
+ to zero at statistics calculation. This counter is not protected by
+ any latch, because this is only used for heuristics. */
+ ib_uint64_t stat_modified_counter;
+
+ bool stats_error_printed;
+ /*!< Has persistent stats error beein
+ already printed for this table ? */
+ /* @} */
+
+ /** AUTOINC related members. @{ */
+
+ /* The actual collection of tables locked during AUTOINC read/write is
+ kept in trx_t. In order to quickly determine whether a transaction has
+ locked the AUTOINC lock we keep a pointer to the transaction here in
+ the 'autoinc_trx' member. This is to avoid acquiring the
+ lock_sys.latch and scanning the vector in trx_t.
+ When an AUTOINC lock has to wait, the corresponding lock instance is
+ created on the trx lock heap rather than use the pre-allocated instance
+ in autoinc_lock below. */
+
+ /** A buffer for an AUTOINC lock for this table. We allocate the
+ memory here so that individual transactions can get it and release it
+ without a need to allocate space from the lock heap of the trx:
+ otherwise the lock heap would grow rapidly if we do a large insert
+ from a select. */
+ lock_t* autoinc_lock;
+
+ /** Mutex protecting autoinc and freed_indexes. */
+ srw_spin_mutex autoinc_mutex;
+private:
+ /** Mutex protecting locks on this table. */
+ srw_spin_mutex lock_mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of lock_mutex (0 if none) */
+ Atomic_relaxed<pthread_t> lock_mutex_owner{0};
+#endif
+public:
+ /** Autoinc counter value to give to the next inserted row. */
+ uint64_t autoinc;
+
+ /** The transaction that currently holds the the AUTOINC lock on this table.
+ Protected by lock_mutex.
+ The thread that is executing autoinc_trx may read this field without
+ holding a latch, in row_lock_table_autoinc_for_mysql().
+ Only the autoinc_trx thread may clear this field; it cannot be
+ modified on the behalf of a transaction that is being handled by a
+ different thread. */
+ Atomic_relaxed<const trx_t*> autoinc_trx;
+
+ /** Number of granted or pending autoinc_lock on this table. This
+ value is set after acquiring lock_sys.latch but
+ in innodb_autoinc_lock_mode=1 (the default),
+ ha_innobase::innobase_lock_autoinc() will perform a dirty read
+ to determine whether other transactions have acquired the autoinc_lock. */
+ uint32_t n_waiting_or_granted_auto_inc_locks;
+
+ /* @} */
+
+ /** Number of granted or pending LOCK_S or LOCK_X on the table.
+ Protected by lock_sys.assert_locked(*this). */
+ uint32_t n_lock_x_or_s;
+
+ /** FTS specific state variables. */
+ fts_t* fts;
+
+ /** Quiescing states, protected by the dict_index_t::lock. ie. we can
+ only change the state if we acquire all the latches (dict_index_t::lock)
+ in X mode of this table's indexes. */
+ ib_quiesce_t quiesce;
+
+ /** Count of the number of record locks on this table. We use this to
+ determine whether we can evict the table from the dictionary cache.
+ Modified when lock_sys.is_writer(), or
+ lock_sys.assert_locked(page_id) and trx->mutex_is_owner() hold.
+ @see trx_lock_t::trx_locks */
+ Atomic_counter<uint32_t> n_rec_locks;
+private:
+ /** Count of how many handles are opened to this table. Dropping of the
+ table is NOT allowed until this count gets to zero. MySQL does NOT
+ itself check the number of open handles at DROP. */
+ Atomic_counter<uint32_t> n_ref_count;
+public:
+ /** List of locks on the table. Protected by lock_sys.assert_locked(lock). */
+ table_lock_list_t locks;
+
+ /** Timestamp of the last modification of this table. */
+ Atomic_relaxed<time_t> update_time;
+ /** Transactions whose view low limit is greater than this number are
+ not allowed to access the MariaDB query cache.
+ @see innobase_query_caching_table_check_low()
+ @see trx_t::commit_tables() */
+ Atomic_relaxed<trx_id_t> query_cache_inv_trx_id;
+
+#ifdef UNIV_DEBUG
+ /** Value of 'magic_n'. */
+ #define DICT_TABLE_MAGIC_N 76333786
+
+ /** Magic number. */
+ ulint magic_n;
+#endif /* UNIV_DEBUG */
+ /** mysql_row_templ_t for base columns used for compute the virtual
+ columns */
+ dict_vcol_templ_t* vc_templ;
+
+ /* @return whether the table has any other transcation lock
+ other than the given transaction */
+ bool has_lock_other_than(const trx_t *trx) const
+ {
+ for (lock_t *lock= UT_LIST_GET_FIRST(locks); lock;
+ lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+ if (lock->trx != trx)
+ return true;
+ return false;
+ }
+
+ /** @return whether a DDL operation is in progress on this table */
+ bool is_active_ddl() const
+ {
+ return UT_LIST_GET_FIRST(indexes)->online_log;
+ }
+
+ /** @return whether the name is
+ mysql.innodb_index_stats or mysql.innodb_table_stats */
+ bool is_stats_table() const;
+
+ /** @return number of unique columns in FTS_DOC_ID index */
+ unsigned fts_n_uniq() const { return versioned() ? 2 : 1; }
+
+ /** Create metadata.
+ @param name table name
+ @param space tablespace
+ @param n_cols total number of columns (both virtual and non-virtual)
+ @param n_v_cols number of virtual columns
+ @param flags table flags
+ @param flags2 table flags2
+ @return newly allocated table object */
+ static dict_table_t *create(const span<const char> &name, fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2);
+
+ /** Check whether the table has any spatial indexes */
+ bool has_spatial_index() const
+ {
+ for (auto i= UT_LIST_GET_FIRST(indexes);
+ (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+ if (i->is_spatial())
+ return true;
+ return false;
+ }
+};
+
+inline void dict_index_t::set_modified(mtr_t& mtr) const
+{
+ mtr.set_named_space(table->space);
+}
+
+inline bool table_name_t::is_temporary() const
+{
+ return dict_table_t::is_temporary_name(m_name);
+}
+
+inline bool dict_index_t::is_readable() const { return table->is_readable(); }
+
+inline bool dict_index_t::is_instant() const
+{
+ ut_ad(n_core_fields > 0);
+ ut_ad(n_core_fields <= n_fields || table->n_dropped());
+ ut_ad(n_core_fields == n_fields
+ || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED);
+ ut_ad(n_core_fields == n_fields || table->supports_instant());
+ ut_ad(n_core_fields == n_fields || !table->is_temporary());
+ ut_ad(!table->instant || !table->is_temporary());
+
+ return n_core_fields != n_fields
+ || (is_primary() && table->instant);
+}
+
+inline bool dict_index_t::is_corrupted() const
+{
+ return UNIV_UNLIKELY(online_status >= ONLINE_INDEX_ABORTED
+ || (type & DICT_CORRUPT)
+ || (table && table->corrupted));
+}
+
+inline void dict_index_t::clear_instant_add()
+{
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(is_instant());
+ DBUG_ASSERT(!table->instant);
+ for (unsigned i= n_core_fields; i < n_fields; i++)
+ fields[i].col->clear_instant();
+ n_core_fields= n_fields;
+ n_core_null_bytes= static_cast<byte>
+ (UT_BITS_IN_BYTES(static_cast<unsigned>(n_nullable)));
+}
+
+inline void dict_index_t::clear_instant_alter()
+{
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(n_fields == n_def);
+
+ if (!table->instant) {
+ if (is_instant()) {
+ clear_instant_add();
+ }
+ return;
+ }
+
+#ifndef DBUG_OFF
+ for (unsigned i = first_user_field(); i--; ) {
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(!fields[i].col->is_nullable());
+ }
+#endif
+ const dict_col_t* ai_col = table->persistent_autoinc
+ ? fields[table->persistent_autoinc - 1].col
+ : NULL;
+ dict_field_t* const begin = &fields[first_user_field()];
+ dict_field_t* end = &fields[n_fields];
+
+ for (dict_field_t* d = begin; d < end; ) {
+ /* Move fields for dropped columns to the end. */
+ if (!d->col->is_dropped()) {
+ d++;
+ } else {
+ if (d->col->is_nullable()) {
+ n_nullable--;
+ }
+
+ std::swap(*d, *--end);
+ }
+ }
+
+ DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end);
+ n_core_fields = n_fields = n_def
+ = static_cast<unsigned>(end - fields) & MAX_N_FIELDS;
+ n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_nullable));
+ std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b)
+ { return a.col->ind < b.col->ind; });
+ table->instant = NULL;
+ if (ai_col) {
+ auto a = std::find_if(fields, end,
+ [ai_col](const dict_field_t& f)
+ { return f.col == ai_col; });
+ table->persistent_autoinc = (a == end)
+ ? 0
+ : (1 + static_cast<unsigned>(a - fields))
+ & MAX_N_FIELDS;
+ }
+}
+
+/** @return whether the column was instantly dropped
+@param[in] index the clustered index */
+inline bool dict_col_t::is_dropped(const dict_index_t& index) const
+{
+ DBUG_ASSERT(index.is_primary());
+ DBUG_ASSERT(!is_dropped() == !index.table->instant);
+ DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped
+ && this < index.table->instant->dropped
+ + index.table->instant->n_dropped));
+ return is_dropped();
+}
+
+/*******************************************************************//**
+Initialise the table lock list. */
+void
+lock_table_lock_list_init(
+/*======================*/
+ table_lock_list_t* locks); /*!< List to initialise */
+
+/** A function object to add the foreign key constraint to the referenced set
+of the referenced table, if it exists in the dictionary cache. */
+struct dict_foreign_add_to_referenced_table {
+ void operator()(dict_foreign_t* foreign) const
+ {
+ if (dict_table_t* table = foreign->referenced_table) {
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = table->referenced_set.insert(foreign);
+ ut_a(ret.second);
+ }
+ }
+};
+
+/** Check whether the col is used in spatial index or regular index.
+@param[in] col column to check
+@return spatial status */
+inline
+spatial_status_t
+dict_col_get_spatial_status(
+ const dict_col_t* col)
+{
+ spatial_status_t spatial_status = SPATIAL_NONE;
+
+ /* Column is not a part of any index. */
+ if (!col->ord_part) {
+ return(spatial_status);
+ }
+
+ if (DATA_GEOMETRY_MTYPE(col->mtype)) {
+ if (col->max_prefix == 0) {
+ spatial_status = SPATIAL_ONLY;
+ } else {
+ /* Any regular index on a geometry column
+ should have a prefix. */
+ spatial_status = SPATIAL_MIXED;
+ }
+ }
+
+ return(spatial_status);
+}
+
+/** Clear defragmentation summary. */
+inline void dict_stats_empty_defrag_summary(dict_index_t* index)
+{
+ index->stat_defrag_n_pages_freed = 0;
+}
+
+/** Clear defragmentation related index stats. */
+inline void dict_stats_empty_defrag_stats(dict_index_t* index)
+{
+ index->stat_defrag_modified_counter = 0;
+ index->stat_defrag_n_page_split = 0;
+}
+
+#include "dict0mem.inl"
+
+#endif /* dict0mem_h */
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
new file mode 100644
index 00000000..d60ee5d9
--- /dev/null
+++ b/storage/innobase/include/dict0mem.inl
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#include "dict0mem.h"
+#include "fil0fil.h"
+
+/**********************************************************************//**
+This function poplulates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+ dict_index_t* index, /*!< out: index to be filled */
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* index_name, /*!< in: index name */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields) /*!< in: number of fields */
+{
+
+ if (heap) {
+ index->heap = heap;
+ index->name = mem_heap_strdup(heap, index_name);
+ index->fields = (dict_field_t*) mem_heap_alloc(
+ heap, 1 + n_fields * sizeof(dict_field_t));
+ } else {
+ index->name = index_name;
+ index->heap = NULL;
+ index->fields = NULL;
+ }
+
+ index->type = type & ((1U << DICT_IT_BITS) - 1);
+ index->page = FIL_NULL;
+ index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+ index->n_fields = static_cast<unsigned>(n_fields)
+ & index->MAX_N_FIELDS;
+ index->n_core_fields = static_cast<unsigned>(n_fields)
+ & index->MAX_N_FIELDS;
+ /* The '1 +' above prevents allocation
+ of an empty mem block */
+ index->nulls_equal = false;
+ ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
+}
diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h
new file mode 100644
index 00000000..f1272dc4
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.h
+Helper functions for extracting/storing page compression information
+to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#ifndef dict0pagecompress_h
+#define dict0pagecompress_h
+
+/********************************************************************//**
+Extract the page compression level from table flags.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+/********************************************************************//**
+Extract the page compression flag from table flags
+@return page compression flag, or false if not compressed */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*==========================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((const));
+
+#include "dict0pagecompress.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0pagecompress.inl b/storage/innobase/include/dict0pagecompress.inl
new file mode 100644
index 00000000..c959f9ca
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.inl
@@ -0,0 +1,81 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.ic
+Inline implementation for helper functions for extracting/storing
+page compression and atomic writes information to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/********************************************************************//**
+Extract the page compression level from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+ ulint flags) /*!< in: flags */
+{
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+ ut_ad(page_compression_level <= 9);
+
+ return(page_compression_level);
+}
+
+/********************************************************************//**
+Check whether the table uses the page compression page format.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(dict_tf_get_page_compression(table->flags));
+
+ return(dict_tf_get_page_compression_level(table->flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compression page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+ ulint flags) /*!< in: flags */
+{
+ return(DICT_TF_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compression page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ return (dict_tf_get_page_compression(table->flags));
+}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
new file mode 100644
index 00000000..0dc1b984
--- /dev/null
+++ b/storage/innobase/include/dict0stats.h
@@ -0,0 +1,238 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_option_t {
+ DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
+ statistics using a precise and slow
+ algo and save them to the persistent
+ storage, if the persistent storage is
+ not present then emit a warning and
+ fall back to transient stats */
+ DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics
+ using an imprecise quick algo
+ without saving the results
+ persistently */
+ DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense)
+ into a table and its indexes' statistics
+ members. The resulting stats correspond to an
+ empty table. If the table is using persistent
+ statistics, then they are saved on disk. */
+ DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats
+ from the persistent storage if the in-memory
+ structures have not been initialized yet,
+ otherwise do nothing */
+};
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool ps_on, /*!< in: persistent stats explicitly enabled */
+ ibool ps_off) /*!< in: persistent stats explicitly disabled */
+ MY_ATTRIBUTE((nonnull));
+
+/** @return whether persistent statistics is enabled for a given table */
+UNIV_INLINE
+bool
+dict_stats_is_persistent_enabled(const dict_table_t* table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool auto_recalc_on, /*!< in: explicitly enabled */
+ ibool auto_recalc_off); /*!< in: explicitly disabled */
+
+/** @return whether auto recalc is enabled for a given table*/
+UNIV_INLINE
+bool
+dict_stats_auto_recalc_is_enabled(const dict_table_t* table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+ dict_table_t* table); /*!< in/out: table */
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+ dict_table_t* table) /*!< in/out: table */
+ MY_ATTRIBUTE((nonnull));
+
+#ifdef WITH_WSREP
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table
+@param[in] trx transaction */
+void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx)
+ MY_ATTRIBUTE((nonnull));
+#else
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table */
+void dict_stats_update_if_needed_func(dict_table_t *table)
+ MY_ATTRIBUTE((nonnull));
+# define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t)
+#endif
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_* error code or DB_SUCCESS */
+dberr_t
+dict_stats_update(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_stats_upd_option_t stats_upd_option);
+ /*!< in: whether to (re) calc
+ the stats or to fetch them from
+ the persistent storage */
+
+/** Execute DELETE FROM mysql.innodb_table_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_table_stats(const char *database_name,
+ const char *table_name,
+ trx_t *trx)
+ MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name,
+ trx_t *trx)
+ MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param index_name name of the index
+@param trx transaction (nullptr=start and commit a new one)
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name,
+ const char *index_name, trx_t *trx);
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+void
+dict_stats_update_for_index(
+/*========================*/
+ dict_index_t* index) /*!< in/out: index */
+ MY_ATTRIBUTE((nonnull));
+
+/** Rename a table in InnoDB persistent stats storage.
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+ trx_t *trx);
+/** Rename an index in InnoDB persistent statistics.
+@param db database name
+@param table table name
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+ const char *old_name, const char *new_name,
+ trx_t *trx);
+
+/** Delete all persistent statistics for a database.
+@param db database name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete(const char *db, trx_t *trx);
+
+/** Save an individual index's statistic into the persistent statistics
+storage.
+@param[in] index index to be updated
+@param[in] last_update timestamp of the stat
+@param[in] stat_name name of the stat
+@param[in] stat_value value of the stat
+@param[in] sample_size n pages sampled or NULL
+@param[in] stat_description description of the stat
+@param[in,out] trx transaction
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_index_stat(
+ dict_index_t* index,
+ time_t last_update,
+ const char* stat_name,
+ ib_uint64_t stat_value,
+ ib_uint64_t* sample_size,
+ const char* stat_description,
+ trx_t* trx)
+ MY_ATTRIBUTE((nonnull(1, 3, 6, 7)));
+
+/** Report an error if updating table statistics failed because
+.ibd file is missing, table decryption failed or table is corrupted.
+@param[in,out] table Table
+@param[in] defragment true if statistics is for defragment
+@retval DB_DECRYPTION_FAILED if decryption of the table failed
+@retval DB_TABLESPACE_DELETED if .ibd file is missing
+@retval DB_CORRUPTION if table is marked as corrupted */
+dberr_t
+dict_stats_report_error(dict_table_t* table, bool defragment = false)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#include "dict0stats.inl"
+
+#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS
+void test_dict_stats_all();
+#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */
+
+#endif /* dict0stats_h */
diff --git a/storage/innobase/include/dict0stats.inl b/storage/innobase/include/dict0stats.inl
new file mode 100644
index 00000000..dd516275
--- /dev/null
+++ b/storage/innobase/include/dict0stats.inl
@@ -0,0 +1,219 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.ic
+Code used for calculating and manipulating table statistics.
+
+Created Jan 23, 2012 Vasil Dimov
+*******************************************************/
+
+#include "dict0dict.h"
+#include "srv0srv.h"
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool ps_on, /*!< in: persistent stats explicitly enabled */
+ ibool ps_off) /*!< in: persistent stats explicitly disabled */
+{
+ /* Not allowed to have both flags set, but a CREATE or ALTER
+ statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would
+ end up having both set. In this case we clear the OFF flag. */
+ if (ps_on && ps_off) {
+ ps_off = FALSE;
+ }
+
+ ib_uint32_t stat_persistent = 0;
+
+ if (ps_on) {
+ stat_persistent |= DICT_STATS_PERSISTENT_ON;
+ }
+
+ if (ps_off) {
+ stat_persistent |= DICT_STATS_PERSISTENT_OFF;
+ }
+
+ /* we rely on this assignment to be atomic */
+ table->stat_persistent = stat_persistent;
+}
+
+/** @return whether persistent statistics is enabled for a given table */
+UNIV_INLINE
+bool
+dict_stats_is_persistent_enabled(const dict_table_t* table)
+{
+ /* Because of the nature of this check (non-locking) it is possible
+ that a table becomes:
+ * PS-disabled immediately after this function has returned TRUE or
+ * PS-enabled immediately after this function has returned FALSE.
+ This means that it is possible that we do:
+ + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has
+ just been PS-disabled or
+ + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has
+ just been PS-enabled.
+ This is acceptable. Avoiding this would mean that we would have to
+ hold dict_sys.latch or stats_mutex_lock() like for accessing the
+ other ::stat_ members which would be too big performance penalty,
+ especially when this function is called from
+ dict_stats_update_if_needed(). */
+
+ /* we rely on this read to be atomic */
+ ib_uint32_t stat_persistent = table->stat_persistent;
+
+ if (stat_persistent & DICT_STATS_PERSISTENT_ON) {
+ ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF));
+ return(true);
+ } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) {
+ return(false);
+ } else {
+ return(srv_stats_persistent);
+ }
+}
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool auto_recalc_on, /*!< in: explicitly enabled */
+ ibool auto_recalc_off) /*!< in: explicitly disabled */
+{
+ ut_ad(!auto_recalc_on || !auto_recalc_off);
+
+ ib_uint32_t stats_auto_recalc = 0;
+
+ if (auto_recalc_on) {
+ stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON;
+ }
+
+ if (auto_recalc_off) {
+ stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF;
+ }
+
+ /* we rely on this assignment to be atomic */
+ table->stats_auto_recalc = stats_auto_recalc;
+}
+
+/** @return whether auto recalc is enabled for a given table*/
+UNIV_INLINE
+bool
+dict_stats_auto_recalc_is_enabled(const dict_table_t* table)
+{
+ /* we rely on this read to be atomic */
+ ib_uint32_t stats_auto_recalc = table->stats_auto_recalc;
+
+ if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) {
+ ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF));
+ return(true);
+ } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) {
+ return(false);
+ } else {
+ return(srv_stats_auto_recalc);
+ }
+}
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!table->stats_mutex_is_owner());
+
+ if (table->stat_initialized) {
+ return;
+ }
+
+ dict_stats_upd_option_t opt;
+
+ if (dict_stats_is_persistent_enabled(table)) {
+ opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
+ } else {
+ opt = DICT_STATS_RECALC_TRANSIENT;
+ }
+
+ dict_stats_update(table, opt);
+}
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(table->stats_mutex_is_owner());
+ ut_ad(table->get_ref_count() == 0);
+
+#ifdef HAVE_valgrind
+ if (!table->stat_initialized) {
+ return;
+ }
+
+ MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows);
+ MEM_UNDEFINED(&table->stat_clustered_index_size,
+ sizeof table->stat_clustered_index_size);
+ MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes,
+ sizeof table->stat_sum_of_other_index_sizes);
+ MEM_UNDEFINED(&table->stat_modified_counter,
+ sizeof table->stat_modified_counter);
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ MEM_UNDEFINED(
+ index->stat_n_diff_key_vals,
+ index->n_uniq
+ * sizeof index->stat_n_diff_key_vals[0]);
+ MEM_UNDEFINED(
+ index->stat_n_sample_sizes,
+ index->n_uniq
+ * sizeof index->stat_n_sample_sizes[0]);
+ MEM_UNDEFINED(
+ index->stat_n_non_null_key_vals,
+ index->n_uniq
+ * sizeof index->stat_n_non_null_key_vals[0]);
+ MEM_UNDEFINED(
+ &index->stat_index_size,
+ sizeof(index->stat_index_size));
+ MEM_UNDEFINED(
+ &index->stat_n_leaf_pages,
+ sizeof(index->stat_n_leaf_pages));
+ }
+#endif /* HAVE_valgrind */
+ table->stat_initialized = FALSE;
+}
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
new file mode 100644
index 00000000..d9a2f628
--- /dev/null
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats_bg.h
+Code used for background table and index stats gathering.
+
+Created Apr 26, 2012 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_bg_h
+#define dict0stats_bg_h
+
+#include "dict0types.h"
+
+#ifdef HAVE_PSI_INTERFACE
+extern mysql_pfs_key_t recalc_pool_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
+
+/** Delete a table from the auto recalc pool, and ensure that
+no statistics are being updated on it. */
+void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive);
+
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread().
+Must be called before dict_stats task is started. */
+void dict_stats_init();
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_thread_init(), must be called
+after dict_stats task has exited. */
+void dict_stats_deinit();
+
+/** Start the dict stats timer. */
+void dict_stats_start();
+
+/** Shut down the dict_stats timer. */
+void dict_stats_shutdown();
+
+/** Reschedule dict stats timer to run now. */
+void dict_stats_schedule_now();
+
+#endif /* dict0stats_bg_h */
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
new file mode 100644
index 00000000..ec50e8cd
--- /dev/null
+++ b/storage/innobase/include/dict0types.h
@@ -0,0 +1,176 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0types.h
+Data dictionary global types
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+#include "univ.i"
+#include "span.h"
+#include <rem0types.h>
+
+using st_::span;
+
+struct dict_col_t;
+struct dict_field_t;
+struct dict_index_t;
+struct dict_table_t;
+struct dict_foreign_t;
+struct dict_v_col_t;
+
+struct ind_node_t;
+struct tab_node_t;
+struct dict_add_v_col_t;
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ibuf table and indexes's ID are assigned as the number
+DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL
+
+typedef ib_id_t table_id_t;
+typedef ib_id_t index_id_t;
+
+/** Maximum transaction identifier */
+#define TRX_ID_MAX IB_ID_MAX
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+extern const byte trx_id_max_bytes[8];
+extern const byte timestamp_max_bytes[7];
+
+/** Error to ignore when we load table dictionary into memory. However,
+the table and index will be marked as "corrupted", and caller will
+be responsible to deal with corrupted table or index.
+Note: please define the IGNORE_ERR_* as bits, so their value can
+be or-ed together */
+enum dict_err_ignore_t {
+ DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */
+ DICT_ERR_IGNORE_FK_NOKEY = 1, /*!< ignore error if any foreign
+ key is missing */
+ DICT_ERR_IGNORE_INDEX = 2, /*!< ignore corrupted indexes */
+ DICT_ERR_IGNORE_RECOVER_LOCK = 4 | DICT_ERR_IGNORE_FK_NOKEY,
+ /*!< Used when recovering table locks
+ for resurrected transactions.
+ Silently load a missing
+ tablespace, and do not load
+ incomplete index definitions. */
+ /** ignore all errors above */
+ DICT_ERR_IGNORE_ALL = 7,
+ /** prepare some DDL operation;
+ do not attempt to load tablespace */
+ DICT_ERR_IGNORE_TABLESPACE = 15,
+ /** prepare to drop the table; do not attempt to load tablespace
+ or the metadata */
+ DICT_ERR_IGNORE_DROP = 31
+};
+
+/** Quiescing states for flushing tables to disk. */
+enum ib_quiesce_t {
+ QUIESCE_NONE,
+ QUIESCE_START, /*!< Initialise, prepare to start */
+ QUIESCE_COMPLETE /*!< All done */
+};
+
+/** Prefix for InnoDB internal tables, adopted from sql/table.h */
+#define TEMP_FILE_PREFIX_INNODB "#sql-ib"
+
+/** Table name wrapper for pretty-printing */
+struct table_name_t
+{
+ /** The name in internal representation */
+ char* m_name;
+
+ /** Default constructor */
+ table_name_t() = default;
+ /** Constructor */
+ table_name_t(char* name) : m_name(name) {}
+
+ /** @return the end of the schema name */
+ const char* dbend() const
+ {
+ const char* sep = strchr(m_name, '/');
+ ut_ad(sep);
+ return sep;
+ }
+
+ /** @return the length of the schema name, in bytes */
+ size_t dblen() const { return size_t(dbend() - m_name); }
+
+ /** Determine the filename-safe encoded table name.
+ @return the filename-safe encoded table name */
+ const char* basename() const { return dbend() + 1; }
+
+ /** The start of the table basename suffix for partitioned tables */
+ static const char part_suffix[4];
+
+ /** Determine the partition or subpartition name suffix.
+ @return the partition name
+ @retval NULL if the table is not partitioned */
+ const char* part() const { return strstr(basename(), part_suffix); }
+
+ /** @return whether this is a temporary or intermediate table name */
+ inline bool is_temporary() const;
+};
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Dump the change buffer at startup */
+extern my_bool ibuf_dump;
+/** Flag to control insert buffer debugging. */
+extern uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** Shift for spatial status */
+#define SPATIAL_STATUS_SHIFT 12
+
+/** Mask to encode/decode spatial status. */
+#define SPATIAL_STATUS_MASK (3U << SPATIAL_STATUS_SHIFT)
+
+#if SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+# error SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+#endif
+
+/** whether a col is used in spatial index or regular index
+Note: the spatial status is part of persistent undo log,
+so we should not modify the values in MySQL 5.7 */
+enum spatial_status_t {
+ /* Unkown status (undo format in 5.7.9) */
+ SPATIAL_UNKNOWN = 0,
+
+ /** Not used in gis index. */
+ SPATIAL_NONE = 1,
+
+ /** Used in both spatial index and regular index. */
+ SPATIAL_MIXED = 2,
+
+ /** Only used in spatial index. */
+ SPATIAL_ONLY = 3
+};
+
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+
+#endif
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
new file mode 100644
index 00000000..06af4dcc
--- /dev/null
+++ b/storage/innobase/include/dyn0buf.h
@@ -0,0 +1,442 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0buf.h
+The dynamically allocated buffer implementation
+
+Created 2013-03-16 Sunny Bains
+*******************************************************/
+
+#ifndef dyn0buf_h
+#define dyn0buf_h
+
+#include "mem0mem.h"
+#include "dyn0types.h"
+#include "ilist.h"
+
+
+/** Class that manages dynamic buffers. It uses a UT_LIST of
+mtr_buf_t::block_t instances. We don't use STL containers in
+order to avoid the overhead of heap calls. Using a custom memory
+allocator doesn't solve the problem either because we have to get
+the memory from somewhere. We can't use the block_t::m_data as the
+backend for the custom allocator because we would like the data in
+the blocks to be contiguous. */
+class mtr_buf_t {
+public:
+ /** SIZE - sizeof(m_node) + sizeof(m_used) */
+ enum { MAX_DATA_SIZE = DYN_ARRAY_DATA_SIZE
+ - sizeof(ilist_node<>) + sizeof(uint32_t) };
+
+ class block_t : public ilist_node<> {
+ public:
+
+ block_t()
+ {
+ compile_time_assert(MAX_DATA_SIZE <= (2 << 15));
+ init();
+ }
+
+ /**
+ Gets the number of used bytes in a block.
+ @return number of bytes used */
+ ulint used() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(static_cast<ulint>(m_used & ~DYN_BLOCK_FULL_FLAG));
+ }
+
+ /**
+ Gets pointer to the start of data.
+ @return pointer to data */
+ byte* start()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_data);
+ }
+
+ /**
+ @return start of data - non const version */
+ byte* begin()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_data);
+ }
+
+ /**
+ @return end of used data - non const version */
+ byte* end()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(begin() + m_used);
+ }
+
+ /**
+ @return start of data - const version */
+ const byte* begin() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_data);
+ }
+
+ /**
+ @return end of used data - const version */
+ const byte* end() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(begin() + m_used);
+ }
+
+ private:
+ /**
+ @return pointer to start of reserved space */
+ template <typename Type>
+ Type push(uint32_t size)
+ {
+ Type ptr = reinterpret_cast<Type>(end());
+
+ m_used += size;
+ ut_ad(m_used <= uint32_t(MAX_DATA_SIZE));
+
+ return(ptr);
+ }
+
+ /**
+ Grow the stack. */
+ void close(const byte* ptr)
+ {
+ /* Check that it is within bounds */
+ ut_ad(ptr >= begin());
+ ut_ad(ptr <= begin() + m_buf_end);
+
+ /* We have done the boundary check above */
+ m_used = uint32_t(ptr - begin());
+
+ ut_ad(m_used <= MAX_DATA_SIZE);
+ ut_d(m_buf_end = 0);
+ }
+
+ /**
+ Initialise the block */
+ void init()
+ {
+ m_used = 0;
+ ut_d(m_buf_end = 0);
+ ut_d(m_magic_n = DYN_BLOCK_MAGIC_N);
+ }
+ private:
+#ifdef UNIV_DEBUG
+ /** If opened then this is the buffer end offset, else 0 */
+ ulint m_buf_end;
+
+ /** Magic number (DYN_BLOCK_MAGIC_N) */
+ ulint m_magic_n;
+#endif /* UNIV_DEBUG */
+
+ /** Storage */
+ byte m_data[MAX_DATA_SIZE];
+
+ /** number of data bytes used in this block;
+ DYN_BLOCK_FULL_FLAG is set when the block becomes full */
+ uint32_t m_used;
+
+ friend class mtr_buf_t;
+ };
+
+ typedef sized_ilist<block_t> list_t;
+
+ /** Default constructor */
+ mtr_buf_t()
+ :
+ m_heap(),
+ m_size()
+ {
+ push_back(&m_first_block);
+ }
+
+ /** Destructor */
+ ~mtr_buf_t()
+ {
+ erase();
+ }
+
+ /** Reset the buffer vector */
+ void erase()
+ {
+ if (m_heap != NULL) {
+ mem_heap_free(m_heap);
+ m_heap = NULL;
+
+ /* Initialise the list and add the first block. */
+ m_list.clear();
+ m_list.push_back(m_first_block);
+ } else {
+ m_first_block.init();
+ ut_ad(m_list.size() == 1);
+ }
+
+ m_size = 0;
+ }
+
+ /**
+ Makes room on top and returns a pointer to a buffer in it. After
+ copying the elements, the caller must close the buffer using close().
+ @param size in bytes of the buffer; MUST be <= MAX_DATA_SIZE!
+ @return pointer to the buffer */
+ byte* open(ulint size)
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ ut_ad(size > 0);
+ ut_ad(size <= MAX_DATA_SIZE);
+
+ block_t* block;
+
+ block = has_space(size) ? back() : add_block();
+
+ ut_ad(block->m_used <= MAX_DATA_SIZE);
+ ut_d(block->m_buf_end = block->m_used + size);
+
+ return(block->end());
+ }
+
+ /**
+ Closes the buffer returned by open.
+ @param ptr end of used space */
+ void close(const byte* ptr)
+ {
+ ut_ad(!m_list.empty());
+ block_t* block = back();
+
+ m_size -= block->used();
+
+ block->close(ptr);
+
+ m_size += block->used();
+ }
+
+ /**
+ Makes room on top and returns a pointer to the added element.
+ The caller must copy the element to the pointer returned.
+ @param size in bytes of the element
+ @return pointer to the element */
+ template <typename Type>
+ Type push(uint32_t size)
+ {
+ ut_ad(size > 0);
+ ut_ad(size <= MAX_DATA_SIZE);
+
+ block_t* block;
+
+ block = has_space(size) ? back() : add_block();
+
+ m_size += size;
+
+ /* See ISO C++03 14.2/4 for why "template" is required. */
+
+ return(block->template push<Type>(size));
+ }
+
+ /**
+ Pushes n bytes.
+ @param str string to write
+ @param len string length */
+ void push(const byte* ptr, uint32_t len)
+ {
+ while (len > 0) {
+ uint32_t n_copied = std::min(len,
+ uint32_t(MAX_DATA_SIZE));
+ ::memmove(push<byte*>(n_copied), ptr, n_copied);
+
+ ptr += n_copied;
+ len -= n_copied;
+ }
+ }
+
+ /**
+ Returns a pointer to an element in the buffer. const version.
+ @param pos position of element in bytes from start
+ @return pointer to element */
+ template <typename Type>
+ const Type at(ulint pos) const
+ {
+ block_t* block = const_cast<block_t*>(
+ const_cast<mtr_buf_t*>(this)->find(pos));
+
+ return(reinterpret_cast<Type>(block->begin() + pos));
+ }
+
+ /**
+ Returns a pointer to an element in the buffer. non const version.
+ @param pos position of element in bytes from start
+ @return pointer to element */
+ template <typename Type>
+ Type at(ulint pos)
+ {
+ block_t* block = const_cast<block_t*>(find(pos));
+
+ return(reinterpret_cast<Type>(block->begin() + pos));
+ }
+
+ /**
+ Returns the size of the total stored data.
+ @return data size in bytes */
+ ulint size() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+#ifdef UNIV_DEBUG
+ ulint total_size = 0;
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+ total_size += it->used();
+ }
+
+ ut_ad(total_size == m_size);
+#endif /* UNIV_DEBUG */
+ return(m_size);
+ }
+
+ /**
+ Iterate over each block and call the functor.
+ @return false if iteration was terminated. */
+ template <typename Functor>
+ bool for_each_block(const Functor& functor) const
+ {
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (!functor(&*it)) {
+ return false;
+ }
+ }
+
+ return(true);
+ }
+
+ /**
+ @return the first block */
+ block_t* front()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return &m_list.front();
+ }
+
+ /**
+ @return true if m_first_block block was not filled fully */
+ bool is_small() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_heap == NULL);
+ }
+
+ /** @return whether the buffer is empty */
+ bool empty() const { return !back()->m_used; }
+
+private:
+ // Disable copying
+ mtr_buf_t(const mtr_buf_t&);
+ mtr_buf_t& operator=(const mtr_buf_t&);
+
+ /**
+ Add the block to the end of the list*/
+ void push_back(block_t* block)
+ {
+ block->init();
+ m_list.push_back(*block);
+ }
+
+ /** @return the last block in the list */
+ block_t* back() const
+ {
+ return &const_cast<block_t&>(m_list.back());
+ }
+
+ /*
+ @return true if request can be fullfilled */
+ bool has_space(ulint size) const
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /*
+ @return true if request can be fullfilled */
+ bool has_space(ulint size)
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /** Find the block that contains the pos.
+ @param pos absolute offset, it is updated to make it relative
+ to the block
+ @return the block containing the pos. */
+ block_t* find(ulint& pos)
+ {
+ ut_ad(!m_list.empty());
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (pos < it->used()) {
+ ut_ad(it->used() >= pos);
+
+ return &*it;
+ }
+
+ pos -= it->used();
+ }
+
+ return NULL;
+ }
+
+ /**
+ Allocate and add a new block to m_list */
+ block_t* add_block()
+ {
+ block_t* block;
+
+ if (m_heap == NULL) {
+ m_heap = mem_heap_create(sizeof(*block));
+ }
+
+ block = reinterpret_cast<block_t*>(
+ mem_heap_alloc(m_heap, sizeof(*block)));
+
+ push_back(block);
+
+ return(block);
+ }
+
+private:
+ /** Heap to use for memory allocation */
+ mem_heap_t* m_heap;
+
+ /** Allocated blocks */
+ list_t m_list;
+
+ /** Total size used by all blocks */
+ ulint m_size;
+
+ /** The default block, should always be the first element. This
+ is for backwards compatibility and to avoid an extra heap allocation
+ for small REDO log records */
+ block_t m_first_block;
+};
+
+#endif /* dyn0buf_h */
diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h
new file mode 100644
index 00000000..83d0b0d6
--- /dev/null
+++ b/storage/innobase/include/dyn0types.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0types.h
+The dynamically allocated buffer types and constants
+
+Created 2013-03-16 Sunny Bains
+*******************************************************/
+
+#ifndef dyn0types_h
+#define dyn0types_h
+
+/** Value of dyn_block_t::magic_n */
+#define DYN_BLOCK_MAGIC_N 375767
+
+/** This is the initial 'payload' size of a dynamic array */
+#define DYN_ARRAY_DATA_SIZE 512
+
+/** Flag for dyn_block_t::used that indicates a full block */
+#define DYN_BLOCK_FULL_FLAG 0x1000000UL
+
+#endif /* dyn0types_h */
diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h
new file mode 100644
index 00000000..a3ea0462
--- /dev/null
+++ b/storage/innobase/include/eval0eval.h
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.h
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node); /*!< in: symbol table node */
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node); /*!< in: expression */
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val); /*!< in: value to set */
+/*****************************************************************//**
+Gets an integer value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node); /*!< in: expression node */
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len); /*!< in: string length or UNIV_SQL_NULL */
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2); /*!< in: node to copy from */
+/*****************************************************************//**
+Gets a iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a comparison node.
+@return the result of the comparison */
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node); /*!< in: comparison node */
+
+
+#include "eval0eval.inl"
+
+#endif
diff --git a/storage/innobase/include/eval0eval.inl b/storage/innobase/include/eval0eval.inl
new file mode 100644
index 00000000..0ea4057f
--- /dev/null
+++ b/storage/innobase/include/eval0eval.inl
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.ic
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*****************************************************************//**
+Evaluates a function node. */
+void
+eval_func(
+/*======*/
+ func_node_t* func_node); /*!< in: function node */
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size); /*!< in: buffer size */
+
+
+/*****************************************************************//**
+Allocates a new buffer if needed.
+@return pointer to buffer */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+ dfield_set_len(dfield, size);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (!data || que_node_get_val_buf_size(node) < size) {
+
+ data = eval_node_alloc_val_buf(node, size);
+ }
+
+ return(data);
+}
+
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node) /*!< in: symbol table node */
+{
+
+ ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ if (sym_node->indirection) {
+ /* The symbol table node is an alias for a variable or a
+ column */
+
+ dfield_copy_data(que_node_get_val(sym_node),
+ que_node_get_val(sym_node->indirection));
+ }
+}
+
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node) /*!< in: expression */
+{
+ if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
+
+ eval_sym((sym_node_t*) exp_node);
+
+ return;
+ }
+
+ eval_func(static_cast<func_node_t*>(exp_node));
+}
+
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data == NULL) {
+ data = eval_node_alloc_val_buf(node, 4);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ mach_write_to_4(data, (ulint) val);
+}
+
+/*****************************************************************//**
+Gets an integer non-SQL null value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node) /*!< in: expression node */
+{
+ const byte* ptr;
+ dfield_t* dfield;
+
+ dfield = que_node_get_val(node);
+ ptr = static_cast<byte*>(dfield_get_data(dfield));
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ return((int) mach_read_from_4(ptr));
+}
+
+/*****************************************************************//**
+Gets a iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ ut_ad(data != NULL);
+
+ return(mach_read_from_1(data));
+}
+
+/*****************************************************************//**
+Sets a iboolean value as the value of a function node. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*====================*/
+ func_node_t* func_node, /*!< in: function node */
+ ibool val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(func_node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data == NULL) {
+ /* Allocate 1 byte to hold the value */
+
+ data = eval_node_alloc_val_buf(func_node, 1);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 1);
+
+ mach_write_to_1(data, val);
+}
+
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len) /*!< in: string length or UNIV_SQL_NULL */
+{
+ byte* data;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_len(que_node_get_val(node), len);
+
+ return;
+ }
+
+ data = eval_node_ensure_val_buf(node, len);
+
+ memcpy(data, str, len);
+}
+
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2) /*!< in: node to copy from */
+{
+ dfield_t* dfield2;
+
+ dfield2 = que_node_get_val(node2);
+
+ eval_node_copy_and_alloc_val(
+ node1,
+ static_cast<byte*>(dfield_get_data(dfield2)),
+ dfield_get_len(dfield2));
+}
diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h
new file mode 100644
index 00000000..a93140bf
--- /dev/null
+++ b/storage/innobase/include/eval0proc.h
@@ -0,0 +1,94 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.h
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+#include "eval0proc.inl"
+
+#endif
diff --git a/storage/innobase/include/eval0proc.inl b/storage/innobase/include/eval0proc.inl
new file mode 100644
index 00000000..b0c5f75b
--- /dev/null
+++ b/storage/innobase/include/eval0proc.inl
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.ic
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ proc_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<proc_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ /* Start execution from the first statement in the statement
+ list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ func_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<func_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ /* Evaluate the procedure */
+
+ eval_exp(node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
new file mode 100644
index 00000000..f43965cd
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.h
@@ -0,0 +1,396 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.h
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0crypt_h
+#define fil0crypt_h
+
+#include "my_crypt.h"
+#include "fil0fil.h"
+
+/**
+* Magic pattern in start of crypt data on page 0
+*/
+#define MAGIC_SZ 6
+
+static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
+ 's', 0xE, 0xC, 'R', 'E', 't' };
+
+/* This key will be used if nothing else is given */
+#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
+
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast= false);
+
+/**
+ * CRYPT_SCHEME_UNENCRYPTED
+ *
+ * Used as intermediate state when convering a space from unencrypted
+ * to encrypted
+ */
+/**
+ * CRYPT_SCHEME_1
+ *
+ * xxx is AES_CTR or AES_CBC (or another block cypher with the same key and iv lengths)
+ * L = AES_ECB(KEY, IV)
+ * CRYPT(PAGE) = xxx(KEY=L, IV=C, PAGE)
+ */
+
+#define CRYPT_SCHEME_1 1
+#define CRYPT_SCHEME_1_IV_LEN 16
+#define CRYPT_SCHEME_UNENCRYPTED 0
+
+/* Cached L or key for given key_version */
+struct key_struct
+{
+ uint key_version; /*!< Version of the key */
+ uint key_length; /*!< Key length */
+ unsigned char key[MY_AES_MAX_KEY_LENGTH]; /*!< Cached key
+ (that is L in CRYPT_SCHEME_1) */
+};
+
+/** is encryption enabled */
+extern ulong srv_encrypt_tables;
+
+/** Mutex helper for crypt_data->scheme
+@param[in, out] schme encryption scheme
+@param[in] exit should we exit or enter mutex ? */
+void
+crypt_data_scheme_locker(
+ st_encryption_scheme* scheme,
+ int exit);
+
+struct fil_space_rotate_state_t
+{
+ time_t start_time; /*!< time when rotation started */
+ ulint active_threads; /*!< active threads in space */
+ uint32_t next_offset; /*!< next "free" offset */
+ uint32_t max_offset; /*!< max offset needing to be rotated */
+ uint min_key_version_found; /*!< min key version found but not
+ rotated */
+ lsn_t end_lsn; /*!< max lsn created when rotating this
+ space */
+ bool starting; /*!< initial write of IV */
+ bool flushing; /*!< space is being flushed at end of rotate */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+
+struct fil_space_crypt_t : st_encryption_scheme
+{
+ public:
+ /** Constructor. Does not initialize the members!
+ The object is expected to be placed in a buffer that
+ has been zero-initialized. */
+ fil_space_crypt_t(
+ uint new_type,
+ uint new_min_key_version,
+ uint new_key_id,
+ fil_encryption_t new_encryption)
+ : st_encryption_scheme(),
+ min_key_version(new_min_key_version),
+ encryption(new_encryption),
+ key_found(0),
+ rotate_state()
+ {
+ key_id = new_key_id;
+ my_random_bytes(iv, sizeof(iv));
+ mysql_mutex_init(0, &mutex, nullptr);
+ locker = crypt_data_scheme_locker;
+ type = new_type;
+
+ if (new_encryption == FIL_ENCRYPTION_OFF ||
+ (!srv_encrypt_tables &&
+ new_encryption == FIL_ENCRYPTION_DEFAULT)) {
+ type = CRYPT_SCHEME_UNENCRYPTED;
+ } else {
+ type = CRYPT_SCHEME_1;
+ min_key_version = key_get_latest_version();
+ }
+
+ key_found = min_key_version;
+ }
+
+ /** Destructor */
+ ~fil_space_crypt_t()
+ {
+ mysql_mutex_destroy(&mutex);
+ }
+
+ /** Get latest key version from encryption plugin
+ @retval key_version or
+ @retval ENCRYPTION_KEY_VERSION_INVALID if used key_id
+ is not found from encryption plugin. */
+ uint key_get_latest_version(void);
+
+ /** Returns true if key was found from encryption plugin
+ and false if not. */
+ bool is_key_found() const {
+ return key_found != ENCRYPTION_KEY_VERSION_INVALID;
+ }
+
+ /** Returns true if tablespace should be encrypted */
+ bool should_encrypt() const {
+ return ((encryption == FIL_ENCRYPTION_ON) ||
+ (srv_encrypt_tables &&
+ encryption == FIL_ENCRYPTION_DEFAULT));
+ }
+
+ /** Return true if tablespace is encrypted. */
+ bool is_encrypted() const {
+ return (encryption != FIL_ENCRYPTION_OFF);
+ }
+
+ /** Return true if default tablespace encryption is used, */
+ bool is_default_encryption() const {
+ return (encryption == FIL_ENCRYPTION_DEFAULT);
+ }
+
+ /** Return true if tablespace is not encrypted. */
+ bool not_encrypted() const {
+ return (encryption == FIL_ENCRYPTION_OFF);
+ }
+
+ /** Write encryption metadata to the first page.
+ @param[in,out] block first page of the tablespace
+ @param[in,out] mtr mini-transaction */
+ void write_page0(buf_block_t* block, mtr_t* mtr);
+
+ uint min_key_version; // min key version for this space
+ fil_encryption_t encryption; // Encryption setup
+
+ mysql_mutex_t mutex; // mutex protecting following variables
+
+ /** Return code from encryption_key_get_latest_version.
+ If ENCRYPTION_KEY_VERSION_INVALID encryption plugin
+ could not find the key and there is no need to call
+ get_latest_key_version again as keys are read only
+ at startup. */
+ uint key_found;
+
+ fil_space_rotate_state_t rotate_state;
+};
+
+/** Status info about encryption */
+struct fil_space_crypt_status_t {
+ ulint space; /*!< tablespace id */
+ ulint scheme; /*!< encryption scheme */
+ uint min_key_version; /*!< min key version */
+ uint current_key_version;/*!< current key version */
+ uint keyserver_requests;/*!< no of key requests to key server */
+ uint key_id; /*!< current key_id */
+ bool rotating; /*!< is key rotation ongoing */
+ bool flushing; /*!< is flush at end of rotation ongoing */
+ ulint rotate_next_page_number; /*!< next page if key rotating */
+ ulint rotate_max_page_number; /*!< max page if key rotating */
+};
+
+/** Statistics about encryption key rotation */
+struct fil_crypt_stat_t
+{
+ ulint pages_read_from_cache= 0;
+ ulint pages_read_from_disk= 0;
+ ulint pages_modified= 0;
+ ulint pages_flushed= 0;
+ ulint estimated_iops= 0;
+};
+
+/** Init space crypt */
+void fil_space_crypt_init();
+
+/** Cleanup space crypt */
+void fil_space_crypt_cleanup();
+
+/**
+Create a fil_space_crypt_t object
+@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or
+ FIL_ENCRYPTION_ON or
+ FIL_ENCRYPTION_OFF
+
+@param[in] key_id Encryption key id
+@return crypt object */
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+ fil_encryption_t encrypt_mode,
+ uint key_id)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Initialize encryption parameters from a tablespace header page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] page first page of the tablespace
+@return crypt data from page 0
+@retval NULL if not present or not valid */
+fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**
+Free a crypt data object
+@param[in,out] crypt_data crypt data to be freed */
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data);
+
+/** Amend encryption information from redo log.
+@param[in] space tablespace
+@param[in] data encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data);
+
+/** Encrypt a buffer.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] dst_frame Output buffer
+@param[in] use_full_checksum full crc32 algo is used
+@return encrypted buffer or NULL */
+byte*
+fil_encrypt_buf(
+ fil_space_crypt_t* crypt_data,
+ ulint space,
+ ulint offset,
+ const byte* src_frame,
+ ulint zip_size,
+ byte* dst_frame,
+ bool use_full_checksum)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Encrypt a page.
+
+@param[in] space Tablespace
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in,out] dst_frame Output buffer
+@return encrypted buffer or NULL */
+byte* fil_space_encrypt(
+ const fil_space_t* space,
+ ulint offset,
+ byte* src_frame,
+ byte* dst_frame)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Decrypt a page.
+@param]in] space_id space id
+@param[in] fsp_flags Tablespace flags
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in,out] src_frame Page to decrypt
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
+dberr_t
+fil_space_decrypt(
+ uint32_t space_id,
+ uint32_t fsp_flags,
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ ulint physical_size,
+ byte* src_frame);
+
+/******************************************************************
+Decrypt a page
+@param[in] space Tablespace
+@param[in] tmp_frame Temporary buffer used for decrypting
+@param[in,out] src_frame Page to decrypt
+@return decrypted page, or original not encrypted page if decryption is
+not needed.
+@retval nullptr on failure */
+byte*
+fil_space_decrypt(
+ const fil_space_t* space,
+ byte* tmp_frame,
+ byte* src_frame)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in] enw_cnt Number of threads to be used */
+void fil_crypt_set_thread_cnt(const uint new_cnt);
+
+/*********************************************************************
+Adjust max key age
+@param[in] val New max key age */
+void fil_crypt_set_rotate_key_age(uint val);
+
+/*********************************************************************
+Adjust rotation iops
+@param[in] val New max roation iops */
+void fil_crypt_set_rotation_iops(uint val);
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in] val New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val);
+
+/*********************************************************************
+Init threads for key rotation */
+void fil_crypt_threads_init();
+
+/*********************************************************************
+Clean up key rotation threads resources */
+void fil_crypt_threads_cleanup();
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in] space Tablespace */
+void fil_space_crypt_close_tablespace(const fil_space_t *space);
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in] space Tablespace
+@param[out] status Crypt status
+return 0 if crypt data present */
+void
+fil_space_crypt_get_status(
+ const fil_space_t* space,
+ struct fil_space_crypt_status_t* status);
+
+/*********************************************************************
+Return crypt statistics
+@param[out] stat Crypt statistics */
+void fil_crypt_total_stat(fil_crypt_stat_t *stat);
+
+#include "fil0crypt.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**
+Verify that post encryption checksum match calculated checksum.
+This function should be called only if tablespace contains crypt_data
+metadata (this is strong indication that tablespace is encrypted).
+Function also verifies that traditional checksum does not match
+calculated checksum as if it does page could be valid unencrypted,
+encrypted, or corrupted.
+
+@param[in,out] page page frame (checksum is temporarily modified)
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return true if page is encrypted AND OK, false otherwise */
+bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Add the tablespace to the rotation list if
+innodb_encrypt_rotate_key_age is 0 or encryption plugin does
+not do key version rotation
+@return whether the tablespace should be added to rotation list */
+bool fil_crypt_must_default_encrypt();
+
+#endif /* fil0crypt_h */
diff --git a/storage/innobase/include/fil0crypt.inl b/storage/innobase/include/fil0crypt.inl
new file mode 100644
index 00000000..cc59b394
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.inl
@@ -0,0 +1,81 @@
+/*****************************************************************************
+
+Copyright (c) 2015, 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.ic
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+/*******************************************************************//**
+Find out whether the page is page encrypted
+@return true if page is page encrypted, false if not */
+UNIV_INLINE
+bool
+fil_page_is_encrypted(
+/*==================*/
+ const byte *buf) /*!< in: page */
+{
+ return(mach_read_from_4(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0);
+}
+
+/*******************************************************************//**
+Get current encryption mode from crypt_data.
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_mode(
+/*===============*/
+ const fil_space_crypt_t* crypt_data)
+{
+ switch (crypt_data->encryption) {
+ case FIL_ENCRYPTION_DEFAULT:
+ return("Default tablespace encryption mode");
+ case FIL_ENCRYPTION_ON:
+ return("Tablespace encrypted");
+ case FIL_ENCRYPTION_OFF:
+ return("Tablespace not encrypted");
+ }
+
+ ut_error;
+ return ("NULL");
+}
+
+/*******************************************************************//**
+Get current encryption type from crypt_data.
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_type(
+ const fil_space_crypt_t* crypt_data)
+{
+ ut_ad(crypt_data != NULL);
+ switch (crypt_data->type) {
+ case CRYPT_SCHEME_UNENCRYPTED:
+ return("scheme unencrypted");
+ break;
+ case CRYPT_SCHEME_1:
+ return("scheme encrypted");
+ break;
+ default:
+ ut_error;
+ }
+
+ return ("NULL");
+}
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
new file mode 100644
index 00000000..6f58e3c1
--- /dev/null
+++ b/storage/innobase/include/fil0fil.h
@@ -0,0 +1,1823 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fsp0types.h"
+#include "mach0data.h"
+#include "assume_aligned.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srw_lock.h"
+#include "buf0dblwr.h"
+#include "hash0hash.h"
+#include "log0recv.h"
+#include "dict0types.h"
+#include "ilist.h"
+#include <set>
+#include <mutex>
+
+struct unflushed_spaces_tag_t;
+struct default_encrypt_tag_t;
+struct space_list_tag_t;
+struct named_spaces_tag_t;
+
+using space_list_t= ilist<fil_space_t, space_list_tag_t>;
+
+// Forward declaration
+extern my_bool srv_use_doublewrite_buf;
+
+/** Possible values of innodb_flush_method */
+enum srv_flush_t
+{
+ /** fsync, the default */
+ SRV_FSYNC= 0,
+ /** open log files in O_DSYNC mode */
+ SRV_O_DSYNC,
+ /** do not call os_file_flush() when writing data files, but do flush
+ after writing to log files */
+ SRV_LITTLESYNC,
+ /** do not flush after writing */
+ SRV_NOSYNC,
+ /** invoke os_file_set_nocache() on data files. This implies using
+ unbuffered I/O but still fdatasync(), because some filesystems might
+ not flush meta-data on write completion */
+ SRV_O_DIRECT,
+ /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
+ durable on write completion */
+ SRV_O_DIRECT_NO_FSYNC
+#ifdef _WIN32
+ /** Traditional Windows appoach to open all files without caching,
+ and do FileFlushBuffers() */
+ ,SRV_ALL_O_DIRECT_FSYNC
+#endif
+};
+
+/** innodb_flush_method */
+extern ulong srv_file_flush_method;
+
+/** Undo tablespaces starts with space_id. */
+extern uint32_t srv_undo_space_id_start;
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern uint32_t srv_undo_tablespaces_open;
+
+/** Check whether given space id is undo tablespace id
+@param[in] space_id space id to check
+@return true if it is undo tablespace else false. */
+inline bool srv_is_undo_tablespace(uint32_t space_id)
+{
+ return srv_undo_space_id_start > 0 &&
+ space_id >= srv_undo_space_id_start &&
+ space_id < srv_undo_space_id_start + srv_undo_tablespaces_open;
+}
+
+class page_id_t;
+
+/** Structure containing encryption specification */
+struct fil_space_crypt_t;
+
+/** File types */
+enum fil_type_t {
+ /** temporary tablespace (temporary undo log or tables) */
+ FIL_TYPE_TEMPORARY,
+ /** a tablespace that is being imported (no logging until finished) */
+ FIL_TYPE_IMPORT,
+ /** persistent tablespace (for system, undo log or tables) */
+ FIL_TYPE_TABLESPACE,
+};
+
+struct fil_node_t;
+
+/** Structure to store first and last value of range */
+struct range_t
+{
+ uint32_t first;
+ uint32_t last;
+};
+
+/** Sort the range based on first value of the range */
+struct range_compare
+{
+ bool operator() (const range_t lhs, const range_t rhs) const
+ {
+ return lhs.first < rhs.first;
+ }
+};
+
+using range_set_t= std::set<range_t, range_compare>;
+/** Range to store the set of ranges of integers */
+class range_set
+{
+private:
+ range_set_t ranges;
+
+ range_set_t::iterator find(uint32_t value) const
+ {
+ auto r_offset= ranges.lower_bound({value, value});
+ const auto r_end= ranges.end();
+ if (r_offset != r_end);
+ else if (empty())
+ return r_end;
+ else
+ r_offset= std::prev(r_end);
+ if (r_offset->first <= value && r_offset->last >= value)
+ return r_offset;
+ return r_end;
+ }
+public:
+ /** Merge the current range with previous range.
+ @param[in] range range to be merged
+ @param[in] prev_range range to be merged with next */
+ void merge_range(range_set_t::iterator range,
+ range_set_t::iterator prev_range)
+ {
+ if (range->first != prev_range->last + 1)
+ return;
+
+ /* Merge the current range with previous range */
+ range_t new_range {prev_range->first, range->last};
+ ranges.erase(prev_range);
+ ranges.erase(range);
+ ranges.emplace(new_range);
+ }
+
+ /** Split the range and add two more ranges
+ @param[in] range range to be split
+ @param[in] value Value to be removed from range */
+ void split_range(range_set_t::iterator range, uint32_t value)
+ {
+ range_t split1{range->first, value - 1};
+ range_t split2{value + 1, range->last};
+
+ /* Remove the existing element */
+ ranges.erase(range);
+
+ /* Insert the two elements */
+ ranges.emplace(split1);
+ ranges.emplace(split2);
+ }
+
+ /** Remove the value with the given range
+ @param[in,out] range range to be changed
+ @param[in] value value to be removed */
+ void remove_within_range(range_set_t::iterator range, uint32_t value)
+ {
+ range_t new_range{range->first, range->last};
+ if (value == range->first)
+ {
+ if (range->first == range->last)
+ {
+ ranges.erase(range);
+ return;
+ }
+ else
+ new_range.first++;
+ }
+ else if (value == range->last)
+ new_range.last--;
+ else if (range->first < value && range->last > value)
+ return split_range(range, value);
+
+ ranges.erase(range);
+ ranges.emplace(new_range);
+ }
+
+ /** Remove the value from the ranges.
+ @param[in] value Value to be removed. */
+ void remove_value(uint32_t value)
+ {
+ if (empty())
+ return;
+ range_t new_range {value, value};
+ range_set_t::iterator range= ranges.lower_bound(new_range);
+ if (range == ranges.end())
+ return remove_within_range(std::prev(range), value);
+
+ if (range->first > value && range != ranges.begin())
+ /* Iterate the previous ranges to delete */
+ return remove_within_range(std::prev(range), value);
+ return remove_within_range(range, value);
+ }
+ /** Add the value within the existing range
+ @param[in] range range to be modified
+ @param[in] value value to be added */
+ range_set_t::iterator add_within_range(range_set_t::iterator range,
+ uint32_t value)
+ {
+ if (range->first <= value && range->last >= value)
+ return range;
+
+ range_t new_range{range->first, range->last};
+ if (range->last + 1 == value)
+ new_range.last++;
+ else if (range->first - 1 == value)
+ new_range.first--;
+ else return ranges.end();
+ ranges.erase(range);
+ return ranges.emplace(new_range).first;
+ }
+ /** Add the range in the ranges set
+ @param[in] new_range range to be added */
+ void add_range(range_t new_range)
+ {
+ auto r_offset= ranges.lower_bound(new_range);
+ auto r_begin= ranges.begin();
+ auto r_end= ranges.end();
+ if (!ranges.size())
+ {
+new_range:
+ ranges.emplace(new_range);
+ return;
+ }
+
+ if (r_offset == r_end)
+ {
+ /* last range */
+ if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+ goto new_range;
+ }
+ else if (r_offset == r_begin)
+ {
+ /* First range */
+ if (add_within_range(r_offset, new_range.first) == r_end)
+ goto new_range;
+ }
+ else if (r_offset->first - 1 == new_range.first)
+ {
+ /* Change starting of the existing range */
+ auto r_value= add_within_range(r_offset, new_range.first);
+ if (r_value != ranges.begin())
+ merge_range(r_value, std::prev(r_value));
+ }
+ else
+ {
+ /* previous range last_value alone */
+ if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+ goto new_range;
+ }
+ }
+
+ /** Add the value in the ranges
+ @param[in] value value to be added */
+ void add_value(uint32_t value)
+ {
+ range_t new_range{value, value};
+ add_range(new_range);
+ }
+
+ bool remove_if_exists(uint32_t value)
+ {
+ auto r_offset= find(value);
+ if (r_offset != ranges.end())
+ {
+ remove_within_range(r_offset, value);
+ return true;
+ }
+ return false;
+ }
+
+ bool contains(uint32_t value) const
+ {
+ return find(value) != ranges.end();
+ }
+
+ ulint size() { return ranges.size(); }
+ void clear() { ranges.clear(); }
+ bool empty() const { return ranges.empty(); }
+ typename range_set_t::iterator begin() { return ranges.begin(); }
+ typename range_set_t::iterator end() { return ranges.end(); }
+};
+#endif
+
+/** Tablespace or log data space */
+#ifndef UNIV_INNOCHECKSUM
+struct fil_io_t
+{
+ /** error code */
+ dberr_t err;
+ /** file; node->space->release() must follow IORequestRead call */
+ fil_node_t *node;
+};
+
+/** Tablespace encryption mode */
+enum fil_encryption_t
+{
+ /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
+ FIL_ENCRYPTION_DEFAULT,
+ /** Encrypted */
+ FIL_ENCRYPTION_ON,
+ /** Not encrypted */
+ FIL_ENCRYPTION_OFF
+};
+
+struct fil_space_t final : ilist_node<unflushed_spaces_tag_t>,
+ ilist_node<default_encrypt_tag_t>,
+ ilist_node<space_list_tag_t>,
+ ilist_node<named_spaces_tag_t>
+#else
+struct fil_space_t final
+#endif
+{
+#ifndef UNIV_INNOCHECKSUM
+ friend fil_node_t;
+ ~fil_space_t()
+ {
+ ut_ad(!latch_owner);
+ ut_ad(!latch_count);
+ latch.destroy();
+ }
+
+ /** fil_system.spaces chain node */
+ fil_space_t *hash;
+ /** LSN of the most recent fil_names_write_if_was_clean().
+ Reset to 0 by fil_names_clear(). Protected by exclusive log_sys.latch.
+ If and only if max_lsn is nonzero, this is in fil_system.named_spaces. */
+ lsn_t max_lsn;
+ /** tablespace identifier */
+ uint32_t id;
+ /** whether undo tablespace truncation is in progress */
+ bool is_being_truncated;
+ fil_type_t purpose;/*!< purpose */
+ UT_LIST_BASE_NODE_T(fil_node_t) chain;
+ /*!< base node for the file chain */
+ uint32_t size; /*!< tablespace file size in pages;
+ 0 if not known yet */
+ uint32_t size_in_header;
+ /* FSP_SIZE in the tablespace header;
+ 0 if not known yet */
+ uint32_t free_len;
+ /*!< length of the FSP_FREE list */
+ uint32_t free_limit;
+ /*!< contents of FSP_FREE_LIMIT */
+ uint32_t recv_size;
+ /*!< recovered tablespace size in pages;
+ 0 if no size change was read from the redo log,
+ or if the size change was implemented */
+ uint32_t n_reserved_extents;
+ /*!< number of reserved free extents for
+ ongoing operations like B-tree page split */
+private:
+#ifdef UNIV_DEBUG
+ fil_space_t *next_in_space_list();
+ fil_space_t *prev_in_space_list();
+
+ fil_space_t *next_in_unflushed_spaces();
+ fil_space_t *prev_in_unflushed_spaces();
+#endif
+
+ /** the committed size of the tablespace in pages */
+ Atomic_relaxed<uint32_t> committed_size;
+ /** Number of pending operations on the file.
+ The tablespace cannot be freed while (n_pending & PENDING) != 0. */
+ std::atomic<uint32_t> n_pending;
+ /** Flag in n_pending that indicates that the tablespace is about to be
+ deleted, and no further operations should be performed */
+ static constexpr uint32_t STOPPING_READS= 1U << 31;
+ /** Flag in n_pending that indicates that the tablespace is being
+ deleted, and no further operations should be performed */
+ static constexpr uint32_t STOPPING_WRITES= 1U << 30;
+ /** Flags in n_pending that indicate that the tablespace is being
+ deleted, and no further operations should be performed */
+ static constexpr uint32_t STOPPING= STOPPING_READS | STOPPING_WRITES;
+ /** Flag in n_pending that indicates that the tablespace is a candidate
+ for being closed, and fil_node_t::is_open() can only be trusted after
+ acquiring fil_system.mutex and resetting the flag */
+ static constexpr uint32_t CLOSING= 1U << 29;
+ /** Flag in n_pending that indicates that the tablespace needs fsync().
+ This must be the least significant flag bit; @see release_flush() */
+ static constexpr uint32_t NEEDS_FSYNC= 1U << 28;
+ /** The reference count */
+ static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
+ /** latch protecting all page allocation bitmap pages */
+ srw_lock latch;
+ pthread_t latch_owner;
+ ut_d(Atomic_relaxed<uint32_t> latch_count;)
+public:
+ /** MariaDB encryption data */
+ fil_space_crypt_t *crypt_data;
+
+ /** Whether needs_flush(), or this is in fil_system.unflushed_spaces */
+ bool is_in_unflushed_spaces;
+
+ /** Whether this in fil_system.default_encrypt_tables (needs key rotation) */
+ bool is_in_default_encrypt;
+
+private:
+ /** Whether any corrupton of this tablespace has been reported */
+ mutable std::atomic_flag is_corrupted;
+
+public:
+ /** mutex to protect freed_ranges and last_freed_lsn */
+ std::mutex freed_range_mutex;
+private:
+ /** Ranges of freed page numbers; protected by freed_range_mutex */
+ range_set freed_ranges;
+
+ /** LSN of freeing last page; protected by freed_range_mutex */
+ lsn_t last_freed_lsn;
+
+public:
+ /** @return whether doublewrite buffering is needed */
+ inline bool use_doublewrite() const;
+
+ /** @return whether a page has been freed */
+ inline bool is_freed(uint32_t page);
+
+ /** Apply freed_ranges to the file.
+ @param writable whether the file is writable
+ @return number of pages written or hole-punched */
+ uint32_t flush_freed(bool writable);
+
+ /** Append a file to the chain of files of a space.
+ @param[in] name file name of a file that is not open
+ @param[in] handle file handle, or OS_FILE_CLOSED
+ @param[in] size file size in entire database pages
+ @param[in] is_raw whether this is a raw device
+ @param[in] atomic_write true if atomic write could be enabled
+ @param[in] max_pages maximum number of pages in file,
+ or UINT32_MAX for unlimited
+ @return file object */
+ fil_node_t* add(const char* name, pfs_os_file_t handle,
+ uint32_t size, bool is_raw, bool atomic_write,
+ uint32_t max_pages = UINT32_MAX);
+#ifdef UNIV_DEBUG
+ /** Assert that the mini-transaction is compatible with
+ updating an allocation bitmap page.
+ @param[in] mtr mini-transaction */
+ void modify_check(const mtr_t& mtr) const;
+#endif /* UNIV_DEBUG */
+
+ /** Try to reserve free extents.
+ @param[in] n_free_now current number of free extents
+ @param[in] n_to_reserve number of extents to reserve
+ @return whether the reservation succeeded */
+ bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
+ {
+ if (n_reserved_extents + n_to_reserve > n_free_now) {
+ return false;
+ }
+
+ n_reserved_extents += n_to_reserve;
+ return true;
+ }
+
+ /** Release the reserved free extents.
+ @param[in] n_reserved number of reserved extents */
+ void release_free_extents(uint32_t n_reserved)
+ {
+ if (!n_reserved) return;
+ ut_a(n_reserved_extents >= n_reserved);
+ n_reserved_extents -= n_reserved;
+ }
+
+ /** Rename a file.
+ @param[in] path tablespace file name after renaming
+ @param[in] log whether to write redo log
+ @param[in] replace whether to ignore the existence of path
+ @return error code
+ @retval DB_SUCCESS on success */
+ dberr_t rename(const char *path, bool log, bool replace= false)
+ MY_ATTRIBUTE((nonnull));
+
+ /** Note that the tablespace has been imported.
+ Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
+ written while the space ID is being updated in each page. */
+ inline void set_imported();
+
+ /** Report the tablespace as corrupted */
+ ATTRIBUTE_COLD void set_corrupted() const;
+
+ /** @return whether the storage device is rotational (HDD, not SSD) */
+ inline bool is_rotational() const;
+
+ /** Open each file. Never invoked on .ibd files.
+ @param create_new_db whether to skip the call to fil_node_t::read_page0()
+ @return whether all files were opened */
+ bool open(bool create_new_db);
+ /** Close each file. Only invoked on fil_system.temp_space. */
+ void close();
+
+ /** Note that operations on the tablespace must stop. */
+ inline void set_stopping();
+
+ /** Note that operations on the tablespace can resume after truncation */
+ inline void clear_stopping();
+
+ /** Drop the tablespace and wait for any pending operations to cease
+ @param id tablespace identifier
+ @param detached_handle pointer to file to be closed later, or nullptr
+ @return tablespace to invoke fil_space_free() on
+ @retval nullptr if no tablespace was found, or it was deleted by
+ another concurrent thread */
+ static fil_space_t *drop(uint32_t id, pfs_os_file_t *detached_handle);
+
+private:
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Try to acquire a tablespace reference (increment referenced()).
+ @param avoid when these flags are set, nothing will be acquired
+ @return the old reference count */
+ uint32_t acquire_low(uint32_t avoid= STOPPING)
+ {
+ uint32_t n= 0;
+ while (!n_pending.compare_exchange_strong(n, n + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed) &&
+ !(n & avoid));
+ return n;
+ }
+public:
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Acquire a tablespace reference.
+ @return whether a tablespace reference was successfully acquired */
+ inline bool acquire_if_not_stopped();
+
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Acquire a tablespace reference for I/O.
+ @param avoid when these flags are set, nothing will be acquired
+ @return whether the file is usable */
+ bool acquire(uint32_t avoid= STOPPING | CLOSING)
+ {
+ const auto flags= acquire_low(avoid) & (avoid);
+ return UNIV_LIKELY(!flags) || (flags == CLOSING && acquire_and_prepare());
+ }
+
+ /** Acquire a tablespace reference for writing.
+ @param avoid when these flags are set, nothing will be acquired
+ @return whether the file is writable */
+ bool acquire_for_write() { return acquire(STOPPING_WRITES | CLOSING); }
+
+ /** Acquire another tablespace reference for I/O. */
+ inline void reacquire();
+
+ /** Release a tablespace reference.
+ @return whether this was the last reference */
+ bool release()
+ {
+ uint32_t n= n_pending.fetch_sub(1, std::memory_order_release);
+ ut_ad(n & PENDING);
+ return (n & PENDING) == 1;
+ }
+
+ /** Clear the NEEDS_FSYNC flag */
+ void clear_flush()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
+ __asm__ __volatile__("lock btrl $28, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>
+ (&n_pending), 28);
+#else
+ n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
+#endif
+ }
+
+private:
+ /** Clear the CLOSING flag */
+ void clear_closing()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(CLOSING == 1U << 29, "compatibility");
+ __asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(CLOSING == 1U << 29, "compatibility");
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>
+ (&n_pending), 29);
+#else
+ n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+#endif
+ }
+
+ /** @return pending operations (and flags) */
+ uint32_t pending()const { return n_pending.load(std::memory_order_acquire); }
+public:
+ /** @return whether close() of the file handle has been requested */
+ bool is_closing() const { return pending() & CLOSING; }
+ /** @return whether the tablespace is about to be dropped */
+ bool is_stopping() const { return pending() & STOPPING; }
+ /** @return whether the tablespace is going to be dropped */
+ bool is_stopping_writes() const { return pending() & STOPPING_WRITES; }
+ /** @return number of pending operations */
+ bool is_ready_to_close() const
+ { return (pending() & (PENDING | CLOSING)) == CLOSING; }
+ /** @return whether fsync() or similar is needed */
+ bool needs_flush() const { return pending() & NEEDS_FSYNC; }
+ /** @return whether fsync() or similar is needed, and the tablespace is
+ not being dropped */
+ bool needs_flush_not_stopping() const
+ { return (pending() & (NEEDS_FSYNC | STOPPING_WRITES)) == NEEDS_FSYNC; }
+
+ uint32_t referenced() const { return pending() & PENDING; }
+private:
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Prepare to close the file handle.
+ @return number of pending operations, possibly with NEEDS_FSYNC flag */
+ uint32_t set_closing()
+ {
+ return n_pending.fetch_or(CLOSING, std::memory_order_acquire);
+ }
+
+public:
+ /** Try to close a file to adhere to the innodb_open_files limit.
+ @param print_info whether to diagnose why a file cannot be closed
+ @return whether a file was closed */
+ static bool try_to_close(bool print_info);
+
+ /** Close all tablespace files at shutdown */
+ static void close_all();
+
+ /** Update last_freed_lsn */
+ void update_last_freed_lsn(lsn_t lsn) { last_freed_lsn= lsn; }
+
+ /** Note that the file will need fsync().
+ @return whether this needs to be added to fil_system.unflushed_spaces */
+ bool set_needs_flush()
+ {
+ uint32_t n= 1;
+ while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ ut_ad(n & PENDING);
+ if (n & (NEEDS_FSYNC | STOPPING_WRITES))
+ return false;
+ }
+
+ return true;
+ }
+
+ /** Clear all freed ranges for undo tablespace when InnoDB
+ encounters TRIM redo log record */
+ void clear_freed_ranges() { freed_ranges.clear(); }
+#endif /* !UNIV_INNOCHECKSUM */
+ /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags;
+ check fsp0types.h to more info about flags. */
+ uint32_t flags;
+
+ /** Determine if full_crc32 is used for a data file
+ @param[in] flags tablespace flags (FSP_SPACE_FLAGS)
+ @return whether the full_crc32 algorithm is active */
+ static bool full_crc32(uint32_t flags)
+ { return flags & FSP_FLAGS_FCRC32_MASK_MARKER; }
+ /** @return whether innodb_checksum_algorithm=full_crc32 is active */
+ bool full_crc32() const { return full_crc32(flags); }
+ /** Determine if full_crc32 is used along with PAGE_COMPRESSED */
+ static bool is_full_crc32_compressed(uint32_t flags)
+ {
+ if (!full_crc32(flags))
+ return false;
+ auto algo= FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags);
+ DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST);
+ return algo != 0;
+ }
+ /** Determine the logical page size.
+ @param flags tablespace flags (FSP_SPACE_FLAGS)
+ @return the logical page size
+ @retval 0 if the flags are invalid */
+ static unsigned logical_size(uint32_t flags)
+ {
+ switch (full_crc32(flags)
+ ? FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags)
+ : FSP_FLAGS_GET_PAGE_SSIZE(flags)) {
+ case 3: return 4096;
+ case 4: return 8192;
+ case 5: return full_crc32(flags) ? 16384 : 0;
+ case 0: return full_crc32(flags) ? 0 : 16384;
+ case 6: return 32768;
+ case 7: return 65536;
+ default: return 0;
+ }
+ }
+ /** Determine the ROW_FORMAT=COMPRESSED page size.
+ @param flags tablespace flags (FSP_SPACE_FLAGS)
+ @return the ROW_FORMAT=COMPRESSED page size
+ @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+ static unsigned zip_size(uint32_t flags)
+ {
+ if (full_crc32(flags))
+ return 0;
+ const uint32_t zip_ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ return zip_ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize : 0;
+ }
+ /** Determine the physical page size.
+ @param flags tablespace flags (FSP_SPACE_FLAGS)
+ @return the physical page size */
+ static unsigned physical_size(uint32_t flags)
+ {
+ if (full_crc32(flags))
+ return logical_size(flags);
+
+ const uint32_t zip_ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ return zip_ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize
+ : unsigned(srv_page_size);
+ }
+
+ /** @return the ROW_FORMAT=COMPRESSED page size
+ @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+ unsigned zip_size() const { return zip_size(flags); }
+ /** @return the physical page size */
+ unsigned physical_size() const { return physical_size(flags); }
+
+ /** Check whether PAGE_COMPRESSED is enabled.
+ @param[in] flags tablespace flags */
+ static bool is_compressed(uint32_t flags)
+ {
+ return is_full_crc32_compressed(flags) ||
+ FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
+ }
+ /** @return whether the compression enabled for the tablespace. */
+ bool is_compressed() const { return is_compressed(flags); }
+
+ /** Get the compression algorithm for full crc32 format.
+ @param flags contents of FSP_SPACE_FLAGS
+ @return PAGE_COMPRESSED algorithm of full_crc32 tablespace
+ @retval 0 if not PAGE_COMPRESSED or not full_crc32 */
+ static unsigned get_compression_algo(uint32_t flags)
+ {
+ return full_crc32(flags)
+ ? FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags)
+ : 0;
+ }
+ /** @return the page_compressed algorithm
+ @retval 0 if not page_compressed */
+ unsigned get_compression_algo() const { return get_compression_algo(flags); }
+ /** Determine if the page_compressed page contains an extra byte
+ for exact compressed stream length
+ @param flags contents of FSP_SPACE_FLAGS
+ @return whether the extra byte is needed */
+ static bool full_crc32_page_compressed_len(uint32_t flags)
+ {
+ DBUG_ASSERT(full_crc32(flags));
+ switch (get_compression_algo(flags)) {
+ case PAGE_LZ4_ALGORITHM:
+ case PAGE_LZO_ALGORITHM:
+ case PAGE_SNAPPY_ALGORITHM:
+ return true;
+ }
+ return false;
+ }
+
+ /** Whether the full checksum matches with non full checksum flags.
+ @param flags contents of FSP_SPACE_FLAGS
+ @param expected expected flags
+ @return true if it is equivalent */
+ static bool is_flags_full_crc32_equal(uint32_t flags, uint32_t expected)
+ {
+ ut_ad(full_crc32(flags));
+ uint32_t fcrc32_psize= FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags);
+
+ if (full_crc32(expected))
+ /* The data file may have been created with a
+ different innodb_compression_algorithm. But
+ we only support one innodb_page_size for all files. */
+ return fcrc32_psize == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected);
+
+ uint32_t non_fcrc32_psize = FSP_FLAGS_GET_PAGE_SSIZE(expected);
+ if (!non_fcrc32_psize)
+ return fcrc32_psize == 5;
+ return fcrc32_psize == non_fcrc32_psize;
+ }
+
+ /** Whether old tablespace flags match full_crc32 flags.
+ @param flags contents of FSP_SPACE_FLAGS
+ @param expected expected flags
+ @return true if it is equivalent */
+ static bool is_flags_non_full_crc32_equal(uint32_t flags, uint32_t expected)
+ {
+ ut_ad(!full_crc32(flags));
+ if (!full_crc32(expected))
+ return false;
+
+ uint32_t non_fcrc32_psize= FSP_FLAGS_GET_PAGE_SSIZE(flags);
+ uint32_t fcrc32_psize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected);
+
+ if (!non_fcrc32_psize)
+ return fcrc32_psize == 5;
+ return fcrc32_psize == non_fcrc32_psize;
+ }
+
+ /** Whether both fsp flags are equivalent */
+ static bool is_flags_equal(uint32_t flags, uint32_t expected)
+ {
+ if (!((flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED)))
+ return true;
+ return full_crc32(flags)
+ ? is_flags_full_crc32_equal(flags, expected)
+ : is_flags_non_full_crc32_equal(flags, expected);
+ }
+
+ /** Validate the tablespace flags for full crc32 format.
+ @param flags contents of FSP_SPACE_FLAGS
+ @return whether the flags are correct in full crc32 format */
+ static bool is_fcrc32_valid_flags(uint32_t flags)
+ {
+ ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER);
+ const ulint page_ssize= physical_size(flags);
+ if (page_ssize < 3 || page_ssize & 8)
+ return false;
+ flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ return flags <= PAGE_ALGORITHM_LAST;
+ }
+ /** Validate the tablespace flags.
+ @param flags contents of FSP_SPACE_FLAGS
+ @param is_ibd whether this is an .ibd file (not system tablespace)
+ @return whether the flags are correct */
+ static bool is_valid_flags(uint32_t flags, bool is_ibd)
+ {
+ DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return false;);
+ if (full_crc32(flags))
+ return is_fcrc32_valid_flags(flags);
+
+ if (flags == 0)
+ return true;
+ if (~FSP_FLAGS_MASK & flags)
+ return false;
+
+ if (FSP_FLAGS_MASK_ATOMIC_BLOBS ==
+ (flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS)))
+ /* If the "atomic blobs" flag (indicating
+ ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag is set, then the
+ ROW_FORMAT!=REDUNDANT flag must also be set. */
+ return false;
+
+ /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag
+ of MySQL 5.6 and MariaDB 10.0, which we ignore.
+ In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20,
+ bits 10..14 would be nonzero 0bsssaa where sss is
+ nonzero PAGE_SSIZE (3, 4, 6, or 7)
+ and aa is ATOMIC_WRITES (not 0b11). */
+ if (FSP_FLAGS_GET_RESERVED(flags) & ~1U)
+ return false;
+
+ const uint32_t ssize= FSP_FLAGS_GET_PAGE_SSIZE(flags);
+ if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8)
+ /* the page_size is not between 4k and 64k;
+ 16k should be encoded as 0, not 5 */
+ return false;
+
+ const uint32_t zssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ if (zssize == 0)
+ /* not ROW_FORMAT=COMPRESSED */;
+ else if (zssize > (ssize ? ssize : 5))
+ /* Invalid KEY_BLOCK_SIZE */
+ return false;
+ else if (~flags &
+ (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+ /* both these flags must set for ROW_FORMAT=COMPRESSED */
+ return false;
+
+ /* The flags do look valid. But, avoid misinterpreting
+ buggy MariaDB 10.1 format flags for
+ PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3}
+ as valid-looking PAGE_SSIZE if this is known to be
+ an .ibd file and we are using the default innodb_page_size=16k. */
+ return(ssize == 0 || !is_ibd || srv_page_size != UNIV_PAGE_SIZE_ORIG);
+ }
+
+#ifndef UNIV_INNOCHECKSUM
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Create a tablespace in fil_system.
+ @param id tablespace identifier
+ @param flags tablespace flags
+ @param purpose tablespace purpose
+ @param crypt_data encryption information
+ @param mode encryption mode
+ @param opened true if space files are opened
+ @return pointer to created tablespace, to be filled in with add()
+ @retval nullptr on failure (such as when the same tablespace exists) */
+ static fil_space_t *create(uint32_t id, uint32_t flags,
+ fil_type_t purpose, fil_space_crypt_t *crypt_data,
+ fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT,
+ bool opened= false);
+
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Acquire a tablespace reference.
+ @param id tablespace identifier
+ @return tablespace
+ @retval nullptr if the tablespace is missing or inaccessible */
+ static fil_space_t *get(uint32_t id);
+ /** Acquire a tablespace reference for writing.
+ @param id tablespace identifier
+ @return tablespace
+ @retval nullptr if the tablespace is missing or inaccessible */
+ static fil_space_t *get_for_write(uint32_t id);
+
+ /** Add/remove the free page in the freed ranges list.
+ @param[in] offset page number to be added
+ @param[in] free true if page to be freed */
+ void free_page(uint32_t offset, bool add=true)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ if (add)
+ return freed_ranges.add_value(offset);
+
+ if (freed_ranges.empty())
+ return;
+
+ return freed_ranges.remove_value(offset);
+ }
+
+ /** Add the range of freed pages */
+ void add_free_ranges(range_set ranges)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ freed_ranges= std::move(ranges);
+ }
+
+ /** Add the set of freed page ranges */
+ void add_free_range(const range_t range)
+ {
+ freed_ranges.add_range(range);
+ }
+
+ /** Set the tablespace size in pages */
+ void set_sizes(uint32_t s)
+ {
+ ut_ad(id ? !size : (size >= s));
+ size= s; committed_size= s;
+ }
+
+ /** Update committed_size in mtr_t::commit() */
+ void set_committed_size() { committed_size= size; }
+
+ /** @return the last persisted page number */
+ uint32_t last_page_number() const { return committed_size - 1; }
+
+ /** @return the size in pages (0 if unreadable) */
+ inline uint32_t get_size();
+
+ /** Read or write data.
+ @param type I/O context
+ @param offset offset in bytes
+ @param len number of bytes
+ @param buf the data to be read or written
+ @param bpage buffer block (for type.is_async() completion callback)
+ @return status and file descriptor */
+ fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
+ void *buf, buf_page_t *bpage= nullptr);
+ /** Flush pending writes from the file system cache to the file. */
+ template<bool have_reference> inline void flush();
+ /** Flush pending writes from the file system cache to the file. */
+ void flush_low();
+
+ /** Read the first page of a data file.
+ @return whether the page was found valid */
+ bool read_page0();
+
+ /** Determine the next tablespace for encryption key rotation.
+ @param space current tablespace (nullptr to start from the beginning)
+ @param recheck whether the removal condition needs to be rechecked after
+ encryption parameters were changed
+ @param encrypt expected state of innodb_encrypt_tables
+ @return the next tablespace
+ @retval nullptr upon reaching the end of the iteration */
+ static space_list_t::iterator next(space_list_t::iterator space,
+ bool recheck, bool encrypt);
+
+#ifdef UNIV_DEBUG
+ bool is_latched() const { return latch_count != 0; }
+#endif
+ bool is_owner() const { return latch_owner == pthread_self(); }
+ /** Acquire the allocation latch in exclusive mode */
+ void x_lock()
+ {
+ latch.wr_lock(SRW_LOCK_CALL);
+ ut_ad(!latch_owner);
+ latch_owner= pthread_self();
+ ut_ad(!latch_count.fetch_add(1));
+ }
+ /** Release the allocation latch from exclusive mode */
+ void x_unlock()
+ {
+ ut_ad(latch_count.fetch_sub(1) == 1);
+ ut_ad(latch_owner == pthread_self());
+ latch_owner= 0;
+ latch.wr_unlock();
+ }
+ /** Acquire the allocation latch in shared mode */
+ void s_lock()
+ {
+ ut_ad(!is_owner());
+ latch.rd_lock(SRW_LOCK_CALL);
+ ut_ad(!latch_owner);
+ ut_d(latch_count.fetch_add(1));
+ }
+ /** Release the allocation latch from shared mode */
+ void s_unlock()
+ {
+ ut_ad(latch_count.fetch_sub(1));
+ ut_ad(!latch_owner);
+ latch.rd_unlock();
+ }
+
+ typedef span<const char> name_type;
+
+ /** @return the tablespace name (databasename/tablename) */
+ name_type name() const;
+
+private:
+ /** @return whether the file is usable for io() */
+ ATTRIBUTE_COLD bool prepare_acquired();
+ /** @return whether the file is usable for io() */
+ ATTRIBUTE_COLD bool acquire_and_prepare();
+#endif /*!UNIV_INNOCHECKSUM */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+/** File node of a tablespace or the log data space */
+struct fil_node_t final
+{
+ /** tablespace containing this file */
+ fil_space_t *space;
+ /** file name; protected by fil_system.mutex and exclusive log_sys.latch */
+ char *name;
+ /** file handle */
+ pfs_os_file_t handle;
+ /** whether the file is on non-rotational media (SSD) */
+ unsigned on_ssd:1;
+ /** how to write page_compressed tables
+ (0=do not punch holes but write minimal amount of data, 1=punch holes,
+ 2=always write the same amount; thinly provisioned storage will compress) */
+ unsigned punch_hole:2;
+ /** whether this file could use atomic write */
+ unsigned atomic_write:1;
+ /** whether the file actually is a raw device or disk partition */
+ unsigned is_raw_disk:1;
+ /** whether the tablespace discovery is being deferred during crash
+ recovery due to incompletely written page 0 */
+ unsigned deferred:1;
+
+ /** size of the file in database pages (0 if not known yet);
+ the possible last incomplete megabyte may be ignored if space->id == 0 */
+ uint32_t size;
+ /** initial size of the file in database pages;
+ FIL_IBD_FILE_INITIAL_SIZE by default */
+ uint32_t init_size;
+ /** maximum size of the file in database pages (0 if unlimited) */
+ uint32_t max_size;
+ /** whether the file is currently being extended */
+ Atomic_relaxed<bool> being_extended;
+ /** link to other files in this tablespace */
+ UT_LIST_NODE_T(fil_node_t) chain;
+
+ /** Filesystem block size */
+ ulint block_size;
+
+ /** @return whether this file is open */
+ bool is_open() const { return handle != OS_FILE_CLOSED; }
+
+ /** Read the first page of a data file.
+ @return whether the page was found valid */
+ bool read_page0();
+
+ /** Determine some file metadata when creating or reading the file.
+ @param file the file that is being created, or OS_FILE_CLOSED */
+ void find_metadata(os_file_t file= OS_FILE_CLOSED
+#ifndef _WIN32
+ , bool create= false, struct stat *statbuf= nullptr
+#endif
+ );
+
+ /** Close the file handle. */
+ void close();
+ /** Same as close() but returns file handle instead of closing it. */
+ pfs_os_file_t detach() MY_ATTRIBUTE((warn_unused_result));
+ /** Prepare to free a file from fil_system.
+ @param detach_handle whether to detach instead of closing a handle
+ @return detached handle or OS_FILE_CLOSED */
+ inline pfs_os_file_t close_to_free(bool detach_handle= false);
+
+ /** Update the data structures on write completion */
+ inline void complete_write();
+
+private:
+ /** Does stuff common for close() and detach() */
+ void prepare_to_close_or_detach();
+};
+
+inline bool fil_space_t::use_doublewrite() const
+{
+ return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf &&
+ buf_dblwr.is_created();
+}
+
+inline void fil_space_t::set_imported()
+{
+ ut_ad(purpose == FIL_TYPE_IMPORT);
+ purpose= FIL_TYPE_TABLESPACE;
+ UT_LIST_GET_FIRST(chain)->find_metadata();
+}
+
+inline bool fil_space_t::is_rotational() const
+{
+ for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ if (!node->on_ssd)
+ return true;
+ return false;
+}
+
+/** Common InnoDB file extensions */
+enum ib_extention {
+ NO_EXT = 0,
+ IBD = 1,
+ ISL = 2,
+ CFG = 3
+};
+extern const char* dot_ext[];
+#define DOT_IBD dot_ext[IBD]
+#define DOT_ISL dot_ext[ISL]
+#define DOT_CFG dot_ext[CFG]
+
+/** When mariadbd is run, the default directory "." is the mysqld datadir,
+but in the MariaDB Embedded Server Library and mysqlbackup it is not the default
+directory, and we must set the base file path explicitly */
+extern const char* fil_path_to_mysql_datadir;
+#else
+# include "univ.i"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE 4U
+
+/** 'null' (undefined) page offset in the context of file spaces */
+#define FIL_NULL ULINT32_UNDEFINED
+
+
+#define FIL_ADDR_PAGE 0U /* first in address is the page offset */
+#define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/
+#define FIL_ADDR_SIZE 6U /* address size is 6 bytes */
+
+/** File space address */
+struct fil_addr_t {
+ /** page number within a tablespace */
+ uint32_t page;
+ /** byte offset within the page */
+ uint16_t boffset;
+};
+
+/** The byte offsets on a file page for various variables @{ */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the
+ page belongs to (== 0) but in later
+ versions the 'new' checksum of the
+ page */
+#define FIL_PAGE_OFFSET 4U /*!< page offset inside space */
+#define FIL_PAGE_PREV 8U /*!< if there is a 'natural'
+ predecessor of the page, its
+ offset. Otherwise FIL_NULL.
+ This field is not set on BLOB
+ pages, which are stored as a
+ singly-linked list. See also
+ FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT 12U /*!< if there is a 'natural' successor
+ of the page, its offset.
+ Otherwise FIL_NULL.
+ B-tree index pages
+ (FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+ on the same PAGE_LEVEL are maintained
+ as a doubly linked list via
+ FIL_PAGE_PREV and FIL_PAGE_NEXT
+ in the collation order of the
+ smallest user record on each page. */
+#define FIL_PAGE_LSN 16U /*!< lsn of the end of the newest
+ modification log record to the page */
+#define FIL_PAGE_TYPE 24U /*!< file page type: FIL_PAGE_INDEX,...,
+ 2 bytes.
+
+ The contents of this field can only
+ be trusted in the following case:
+ if the page is an uncompressed
+ B-tree index page, then it is
+ guaranteed that the value is
+ FIL_PAGE_INDEX.
+ The opposite does not hold.
+
+ In tablespaces created by
+ MySQL/InnoDB 5.1.7 or later, the
+ contents of this field is valid
+ for all uncompressed pages. */
+
+/** For the first page in a system tablespace data file(ibdata*, not *.ibd):
+the file has been flushed to disk at least up to this lsn
+For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32
+format: 32-bit key version used to encrypt the page + 32-bit checksum
+or 64 bits of zero if no encryption */
+#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U
+
+/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
+#define FIL_RTREE_SPLIT_SEQ_NUM FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+
+/** Start of the page_compressed content */
+#define FIL_PAGE_COMP_ALGO FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+
+/** starting from 4.1.x this contains the space id of the page */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34U
+
+#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+#define FIL_PAGE_DATA 38U /*!< start of the data on the page */
+
+/** 32-bit key version used to encrypt the page in full_crc32 format.
+For non-encrypted page, it contains 0. */
+#define FIL_PAGE_FCRC32_KEY_VERSION 0
+
+/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */
+/** Number of bytes used to store actual payload data size on
+page_compressed pages when not using full_crc32. */
+#define FIL_PAGE_COMP_SIZE 0
+
+/** Number of bytes for FIL_PAGE_COMP_SIZE */
+#define FIL_PAGE_COMP_METADATA_LEN 2
+
+/** Number of bytes used to store actual compression method
+for encrypted tables when not using full_crc32. */
+#define FIL_PAGE_ENCRYPT_COMP_ALGO 2
+
+/** Extra header size for encrypted page_compressed pages when
+not using full_crc32 */
+#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN 4
+/* @} */
+
+/** File page trailer @{ */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used
+ to store the page checksum, the
+ last 4 bytes should be identical
+ to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */
+
+/** Store the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_FCRC32_END_LSN 8
+
+/** Store crc32 checksum at the end of the page */
+#define FIL_PAGE_FCRC32_CHECKSUM 4
+/* @} */
+
+/** File page types (values of FIL_PAGE_TYPE) @{ */
+/** page_compressed, encrypted=YES (not used for full_crc32) */
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED= 37401;
+/** page_compressed (not used for full_crc32) */
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED= 34354;
+/** B-tree index page */
+constexpr uint16_t FIL_PAGE_INDEX= 17855;
+/** R-tree index page (SPATIAL INDEX) */
+constexpr uint16_t FIL_PAGE_RTREE= 17854;
+/** Undo log page */
+constexpr uint16_t FIL_PAGE_UNDO_LOG= 2;
+/** Index node (of file-in-file metadata) */
+constexpr uint16_t FIL_PAGE_INODE= 3;
+/** Insert buffer free list */
+constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4;
+/** Freshly allocated page */
+constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0;
+/** Change buffer bitmap (pages n*innodb_page_size+1) */
+constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5;
+/** System page */
+constexpr uint16_t FIL_PAGE_TYPE_SYS= 6;
+/** Transaction system data */
+constexpr uint16_t FIL_PAGE_TYPE_TRX_SYS= 7;
+/** Tablespace header (page 0) */
+constexpr uint16_t FIL_PAGE_TYPE_FSP_HDR= 8;
+/** Extent descriptor page (pages n*innodb_page_size, except 0) */
+constexpr uint16_t FIL_PAGE_TYPE_XDES= 9;
+/** Uncompressed BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_BLOB= 10;
+/** First ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB= 11;
+/** Subsequent ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB2= 12;
+/** In old tablespaces, garbage in FIL_PAGE_TYPE is replaced with this
+value when flushing pages. */
+constexpr uint16_t FIL_PAGE_TYPE_UNKNOWN= 13;
+
+/* File page types introduced in MySQL 5.7, not supported in MariaDB */
+//constexpr uint16_t FIL_PAGE_COMPRESSED = 14;
+//constexpr uint16_t FIL_PAGE_ENCRYPTED = 15;
+//constexpr uint16_t FIL_PAGE_COMPRESSED_AND_ENCRYPTED = 16;
+//constexpr FIL_PAGE_ENCRYPTED_RTREE = 17;
+/** Clustered index root page after instant ADD COLUMN */
+constexpr uint16_t FIL_PAGE_TYPE_INSTANT= 18;
+
+/** Used by i_s.cc to index into the text description.
+Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */
+constexpr uint16_t FIL_PAGE_TYPE_LAST= FIL_PAGE_TYPE_UNKNOWN;
+
+/** Set in FIL_PAGE_TYPE for full_crc32 pages in page_compressed format.
+If the flag is set, then the following holds for the remaining bits
+of FIL_PAGE_TYPE:
+Bits 0..7 will contain the compressed page size in bytes.
+Bits 8..14 are reserved and must be 0. */
+constexpr uint16_t FIL_PAGE_COMPRESS_FCRC32_MARKER= 15;
+/* @} */
+
+/** @return whether the page type is B-tree or R-tree index */
+inline bool fil_page_type_is_index(uint16_t page_type)
+{
+ switch (page_type) {
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ return(true);
+ }
+ return(false);
+}
+
+/** Check whether the page is index page (either regular Btree index or Rtree
+index */
+#define fil_page_index_page_check(page) \
+ fil_page_type_is_index(fil_page_get_type(page))
+
+/** Get the file page type.
+@param[in] page file page
+@return page type */
+inline uint16_t fil_page_get_type(const byte *page)
+{
+ return mach_read_from_2(my_assume_aligned<2>(page + FIL_PAGE_TYPE));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Number of pending tablespace flushes */
+extern Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or a MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
+@param[in] id tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t *fil_space_get(uint32_t id)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** The tablespace memory cache */
+struct fil_system_t
+{
+ /**
+ Constructor.
+
+ Some members may require late initialisation, thus we just mark object as
+ uninitialised. Real initialisation happens in create().
+ */
+ fil_system_t() : m_initialised(false) {}
+
+ bool is_initialised() const { return m_initialised; }
+
+ /**
+ Create the file system interface at database start.
+
+ @param[in] hash_size hash table size
+ */
+ void create(ulint hash_size);
+
+ /** Close the file system interface at shutdown */
+ void close();
+
+private:
+ bool m_initialised;
+
+ /** Points to the last opened space in space_list. Protected with
+ fil_system.mutex. */
+ fil_space_t *space_list_last_opened= nullptr;
+
+#ifdef __linux__
+ /** available block devices that reside on non-rotational storage */
+ std::vector<dev_t> ssd;
+public:
+ /** @return whether a file system device is on non-rotational storage */
+ bool is_ssd(dev_t dev) const
+ {
+ /* Linux seems to allow up to 15 partitions per block device.
+ If the detected ssd carries "partition number 0" (it is the whole device),
+ compare the candidate file system number without the partition number. */
+ for (const auto s : ssd)
+ if (dev == s || (dev & ~15U) == s)
+ return true;
+ return false;
+ }
+#endif
+public:
+ /** Detach a tablespace from the cache and close the files.
+ @param space tablespace
+ @param detach_handle whether to detach the handle, instead of closing
+ @return detached handle
+ @retval OS_FILE_CLOSED if no handle was detached */
+ pfs_os_file_t detach(fil_space_t *space, bool detach_handle= false);
+
+ /** the mutex protecting most data fields, and some fields of fil_space_t */
+ mysql_mutex_t mutex;
+ fil_space_t* sys_space; /*!< The innodb_system tablespace */
+ fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
+ /** Map of fil_space_t::id to fil_space_t* */
+ hash_table_t spaces;
+ /** tablespaces for which fil_space_t::needs_flush() holds */
+ sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
+ /** number of currently open files; protected by mutex */
+ ulint n_open;
+ /** last time we noted n_open exceeding the limit; protected by mutex */
+ time_t n_open_exceeded_time;
+ /** maximum persistent tablespace id that has ever been assigned */
+ uint32_t max_assigned_id;
+ /** nonzero if fil_node_open_file_low() should avoid moving the tablespace
+ to the end of space_list, for FIFO policy of try_to_close() */
+ ulint freeze_space_list;
+ /** List of all file spaces, opened spaces should be at the top of the list
+ to optimize try_to_close() execution. Protected with fil_system.mutex. */
+ ilist<fil_space_t, space_list_tag_t> space_list;
+ /** list of all tablespaces for which a FILE_MODIFY record has been written
+ since the latest redo log checkpoint.
+ Protected only by exclusive log_sys.latch. */
+ ilist<fil_space_t, named_spaces_tag_t> named_spaces;
+
+ /** list of all ENCRYPTED=DEFAULT tablespaces that need
+ to be converted to the current value of innodb_encrypt_tables */
+ ilist<fil_space_t, default_encrypt_tag_t> default_encrypt_tables;
+
+ /** whether fil_space_t::create() has issued a warning about
+ potential space_id reuse */
+ bool space_id_reuse_warned;
+
+ /** Add the file to the end of opened spaces list in
+ fil_system.space_list, so that fil_space_t::try_to_close() should close
+ it as a last resort.
+ @param space space to add */
+ void add_opened_last_to_space_list(fil_space_t *space);
+
+ /** Move the file to the end of opened spaces list in
+ fil_system.space_list, so that fil_space_t::try_to_close() should close
+ it as a last resort.
+ @param space space to move */
+ inline void move_opened_last_to_space_list(fil_space_t *space)
+ {
+ /* In the case when several files of the same space are added in a
+ row, there is no need to remove and add a space to the same position
+ in space_list. It can be for system or temporary tablespaces. */
+ if (freeze_space_list || space_list_last_opened == space)
+ return;
+
+ space_list.erase(space_list_t::iterator(space));
+ add_opened_last_to_space_list(space);
+ }
+
+ /** Move closed file last in fil_system.space_list, so that
+ fil_space_t::try_to_close() iterates opened files first in FIFO order,
+ i.e. first opened, first closed.
+ @param space space to move */
+ void move_closed_last_to_space_list(fil_space_t *space)
+ {
+ if (UNIV_UNLIKELY(freeze_space_list))
+ return;
+
+ space_list_t::iterator s= space_list_t::iterator(space);
+
+ if (space_list_last_opened == space)
+ {
+ ut_ad(s != space_list.begin());
+ space_list_t::iterator prev= s;
+ space_list_last_opened= &*--prev;
+ }
+
+ space_list.erase(s);
+ space_list.push_back(*space);
+ }
+
+ /** Return the next tablespace from default_encrypt_tables list.
+ @param space previous tablespace (nullptr to start from the start)
+ @param recheck whether the removal condition needs to be rechecked after
+ the encryption parameters were changed
+ @param encrypt expected state of innodb_encrypt_tables
+ @return the next tablespace to process (n_pending_ops incremented)
+ @retval fil_system.temp_space if there is no work to do
+ @retval nullptr upon reaching the end of the iteration */
+ inline fil_space_t* default_encrypt_next(fil_space_t *space, bool recheck,
+ bool encrypt);
+
+ /** Extend all open data files to the recovered size */
+ ATTRIBUTE_COLD void extend_to_recv_size();
+
+ /** Determine if a tablespace associated with a file name exists.
+ @param path tablespace file name to look for
+ @return a matching tablespace */
+ inline fil_space_t *find(const char *path) const;
+};
+
+/** The tablespace memory cache. */
+extern fil_system_t fil_system;
+
+inline void fil_space_t::reacquire()
+{
+ ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
+#ifdef SAFE_MUTEX
+ if (mysql_mutex_is_owner(&fil_system.mutex)) return;
+ ut_ad(n & PENDING);
+ ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+#endif /* SAFE_MUTEX */
+}
+
+/** Note that operations on the tablespace must stop. */
+inline void fil_space_t::set_stopping()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+ __asm__ __volatile__("lock btsl $30, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+ _interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 30);
+#else
+ n_pending.fetch_or(STOPPING_WRITES, std::memory_order_relaxed);
+#endif
+}
+
+inline void fil_space_t::clear_stopping()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+ ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
+ ut_ad((n & STOPPING) == STOPPING_WRITES);
+}
+
+/** Flush pending writes from the file system cache to the file. */
+template<bool have_reference> inline void fil_space_t::flush()
+{
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
+ ut_ad(!have_reference || (pending() & PENDING));
+ ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
+ if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+ {
+ ut_ad(!is_in_unflushed_spaces);
+ ut_ad(!needs_flush());
+ }
+ else if (have_reference)
+ flush_low();
+ else
+ {
+ if (!(acquire_low(STOPPING | CLOSING) & (STOPPING | CLOSING)))
+ {
+ flush_low();
+ release();
+ }
+ }
+}
+
+/** @return the size in pages (0 if unreadable) */
+inline uint32_t fil_space_t::get_size()
+{
+ if (!size)
+ {
+ mysql_mutex_lock(&fil_system.mutex);
+ read_page0();
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+ return size;
+}
+
+#include "fil0crypt.h"
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion id's is not enough, we may need
+to recycle id's.
+@return true if assigned, false if not */
+bool fil_assign_new_space_id(uint32_t *space_id);
+
+/** Frees a space object from the tablespace memory cache.
+Closes the files in the chain but does not delete them.
+There must not be any pending i/o's or flushes on the files.
+@param id tablespace identifier
+@param x_latched whether the caller holds exclusive fil_space_t::latch
+@return true if success */
+bool fil_space_free(uint32_t id, bool x_latched);
+
+/** Set the recovered size of a tablespace in pages.
+@param id tablespace ID
+@param size recovered size in pages
+@param flags tablespace flags */
+void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
+ uint32_t flags);
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+void fil_set_max_space_id_if_bigger(uint32_t max_id);
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Delete a tablespace and associated .ibd file.
+@param id tablespace identifier
+@return detached file handle (to be closed by the caller)
+@return OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(uint32_t id);
+
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(uint32_t id);
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by caller with ut_free().
+@param[in] path NULL or the directory path or the full path and filename.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed.
+@return own: file name */
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+ ib_extention ext, bool trim_name);
+
+char *fil_make_filepath(const char* path, const table_name_t name,
+ ib_extention suffix, bool strip_name);
+
+/** Create a tablespace file.
+@param[in] space_id Tablespace ID
+@param[in] name Tablespace name in dbname/tablename format.
+@param[in] path Path and filename of the datafile to create.
+@param[in] flags Tablespace flags
+@param[in] size Initial size of the tablespace file in pages,
+must be >= FIL_IBD_FILE_INITIAL_SIZE
+@param[in] mode MariaDB encryption mode
+@param[in] key_id MariaDB encryption key_id
+@param[out] err DB_SUCCESS or error code
+@return the created tablespace
+@retval NULL on error */
+fil_space_t*
+fil_ibd_create(
+ uint32_t space_id,
+ const table_name_t name,
+ const char* path,
+ uint32_t flags,
+ uint32_t size,
+ fil_encryption_t mode,
+ uint32_t key_id,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
+(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
+@param[in,out] space tablespace
+@param[in] flags desired tablespace flags */
+void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags);
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of dict_sys.latch, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file. This boolean may be initially false, but if
+a remote tablespace is found it will be changed to true.
+
+@param[in] validate 0=maybe missing, 1=do not validate, 2=validate
+@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
+@param[in] id tablespace ID
+@param[in] flags expected FSP_SPACE_FLAGS
+@param[in] name table name
+If file-per-table, it is the table name in the databasename/tablename format
+@param[in] path_in expected filepath, usually read from dictionary
+@param[out] err DB_SUCCESS or error code
+@return tablespace
+@retval NULL if the tablespace could not be opened */
+fil_space_t*
+fil_ibd_open(
+ unsigned validate,
+ fil_type_t purpose,
+ uint32_t id,
+ uint32_t flags,
+ fil_space_t::name_type name,
+ const char* path_in,
+ dberr_t* err = NULL)
+ MY_ATTRIBUTE((warn_unused_result));
+
+enum fil_load_status {
+ /** The tablespace file(s) were found and valid. */
+ FIL_LOAD_OK,
+ /** The name no longer matches space_id */
+ FIL_LOAD_ID_CHANGED,
+ /** The file(s) were not found */
+ FIL_LOAD_NOT_FOUND,
+ /** The file(s) were not valid */
+ FIL_LOAD_INVALID,
+ /** The tablespace file was deferred to open */
+ FIL_LOAD_DEFER
+};
+
+/** Open a single-file tablespace and add it to the InnoDB data structures.
+@param[in] space_id tablespace ID
+@param[in] filename path/to/databasename/tablename.ibd
+@param[out] space the tablespace, or NULL on error
+@return status of the operation */
+enum fil_load_status
+fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine if a matching tablespace exists in the InnoDB tablespace
+memory cache. Note that if we have not done a crash recovery at the database
+startup, there may be many tablespaces which are not yet in the memory cache.
+@param[in] id Tablespace ID
+@param[in] table_flags table flags
+@return the tablespace
+@retval NULL if no matching tablespace exists in the memory cache */
+fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id,
+ uint32_t table_flags);
+
+/** Try to extend a tablespace if it is smaller than the specified size.
+@param[in,out] space tablespace
+@param[in] size desired size in pages
+@return whether the tablespace is at least as big as requested */
+bool fil_space_extend(fil_space_t *space, uint32_t size);
+
+/** Flush to disk the writes in file spaces of the given type
+possibly cached by the OS. */
+void fil_flush_file_spaces();
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return true if ok */
+bool fil_validate();
+/*********************************************************************//**
+Sets the file page type. */
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type); /*!< in: type */
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+void
+fil_delete_file(
+/*============*/
+ const char* path); /*!< in: filepath of the ibd tablespace */
+
+/** Look up a tablespace.
+@param tablespace identifier
+@return tablespace
+@retval nullptr if not found */
+fil_space_t *fil_space_get_by_id(uint32_t id);
+
+/** Note that a non-predefined persistent tablespace has been modified
+by redo log.
+@param[in,out] space tablespace */
+void
+fil_names_dirty(
+ fil_space_t* space);
+
+
+bool fil_comp_algo_loaded(ulint comp_algo);
+
+/** On a log checkpoint, reset fil_names_dirty_and_write() flags
+and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
+@param lsn checkpoint LSN
+@return current LSN */
+lsn_t fil_names_clear(lsn_t lsn);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+void test_make_filepath();
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+/** Determine the block size of the data file.
+@param[in] space tablespace
+@param[in] offset page number
+@return block size */
+ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset);
+
+/** Check whether encryption key found
+@param crypt_data Encryption data
+@param f_name File name
+@return encryption key found */
+bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name);
+
+#endif /* UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
new file mode 100644
index 00000000..2927da3c
--- /dev/null
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef fil0pagecompress_h
+#define fil0pagecompress_h
+
+#include "fsp0fsp.h"
+
+/******************************************************************//**
+@file include/fil0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to table space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/** Compress a page_compressed page before writing to a data file.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] flags tablespace flags
+@param[in] block_size file system block size
+@param[in] encrypted whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+ulint fil_page_compress(
+ const byte* buf,
+ byte* out_buf,
+ uint32_t flags,
+ ulint block_size,
+ bool encrypted)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Decompress a page that may be subject to page_compressed compression.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size)
+@param[in,out] buf compressed page buffer
+@param[in] flags tablespace flags
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+ulint fil_page_decompress(byte *tmp_buf, byte *buf, uint32_t flags)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif
diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h
new file mode 100644
index 00000000..67e79f1a
--- /dev/null
+++ b/storage/innobase/include/fsp0file.h
@@ -0,0 +1,509 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0file.h
+Tablespace data file implementation.
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0file_h
+#define fsp0file_h
+
+#include "mem0mem.h"
+#include "os0file.h"
+#include "fil0fil.h"
+
+/** Types of raw partitions in innodb_data_file_path */
+enum device_t {
+ SRV_NOT_RAW = 0, /*!< Not a raw partition */
+ SRV_NEW_RAW, /*!< A 'newraw' partition, only to be
+ initialized */
+ SRV_OLD_RAW /*!< An initialized raw partition */
+};
+
+/** Data file control information. */
+class Datafile {
+
+ friend class Tablespace;
+ friend class SysTablespace;
+
+public:
+
+ Datafile()
+ :
+ m_filepath(),
+ m_filename(),
+ m_handle(),
+ m_open_flags(OS_FILE_OPEN),
+ m_size(),
+ m_order(),
+ m_type(SRV_NOT_RAW),
+ m_space_id(UINT32_MAX),
+ m_flags(),
+ m_exists(),
+ m_is_valid(),
+ m_first_page(),
+ m_last_os_error(),
+ m_file_info()
+ {
+ /* No op */
+ }
+
+ Datafile(uint32_t flags, uint32_t size, ulint order)
+ :
+ m_filepath(),
+ m_filename(),
+ m_handle(),
+ m_open_flags(OS_FILE_OPEN),
+ m_size(size),
+ m_order(order),
+ m_type(SRV_NOT_RAW),
+ m_space_id(UINT32_MAX),
+ m_flags(flags),
+ m_exists(),
+ m_is_valid(),
+ m_first_page(),
+ m_last_os_error(),
+ m_file_info()
+ {
+ }
+
+ Datafile(const Datafile& file)
+ :
+ m_handle(file.m_handle),
+ m_open_flags(file.m_open_flags),
+ m_size(file.m_size),
+ m_order(file.m_order),
+ m_type(file.m_type),
+ m_space_id(file.m_space_id),
+ m_flags(file.m_flags),
+ m_exists(file.m_exists),
+ m_is_valid(file.m_is_valid),
+ m_first_page(),
+ m_last_os_error(),
+ m_file_info()
+ {
+ if (file.m_filepath != NULL) {
+ m_filepath = mem_strdup(file.m_filepath);
+ ut_a(m_filepath != NULL);
+ set_filename();
+ } else {
+ m_filepath = NULL;
+ m_filename = NULL;
+ }
+ }
+
+ virtual ~Datafile()
+ {
+ shutdown();
+ }
+
+ Datafile& operator=(const Datafile& file)
+ {
+ ut_a(this != &file);
+
+ m_size = file.m_size;
+ m_order = file.m_order;
+ m_type = file.m_type;
+
+ ut_a(m_handle == OS_FILE_CLOSED);
+ m_handle = file.m_handle;
+
+ m_exists = file.m_exists;
+ m_is_valid = file.m_is_valid;
+ m_open_flags = file.m_open_flags;
+ m_space_id = file.m_space_id;
+ m_flags = file.m_flags;
+ m_last_os_error = 0;
+
+ if (m_filepath != NULL) {
+ ut_free(m_filepath);
+ m_filepath = NULL;
+ m_filename = NULL;
+ }
+
+ if (file.m_filepath != NULL) {
+ m_filepath = mem_strdup(file.m_filepath);
+ ut_a(m_filepath != NULL);
+ set_filename();
+ }
+
+ /* Do not make a copy of the first page,
+ it should be reread if needed */
+ m_first_page = NULL;
+
+ return(*this);
+ }
+
+ /** Initialize the tablespace flags */
+ void init(uint32_t flags) { m_flags= flags; }
+
+ /** Release the resources. */
+ virtual void shutdown();
+
+ /** Open a data file in read-only mode to check if it exists
+ so that it can be validated.
+ @param[in] strict whether to issue error messages
+ @return DB_SUCCESS or error code */
+ dberr_t open_read_only(bool strict);
+
+ /** Open a data file in read-write mode during start-up so that
+ doublewrite pages can be restored and then it can be validated.
+ @return DB_SUCCESS or error code */
+ inline dberr_t open_read_write()
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Initialize OS specific file info. */
+ void init_file_info();
+
+ /** Close a data file.
+ @return DB_SUCCESS or error code */
+ dberr_t close();
+
+ /** Make a full filepath from a directory path and a filename.
+ Prepend the dirpath to filename using the extension given.
+ If dirpath is NULL, prepend the default datadir to filepath.
+ Store the result in m_filepath.
+ @param dirpath directory path
+ @param name tablespace (table) name
+ @param ext filename extension */
+ void make_filepath(const char* dirpath, fil_space_t::name_type name,
+ ib_extention ext);
+
+ /** Set the filepath by duplicating the filepath sent in */
+ void set_filepath(const char* filepath);
+
+ /** Validates the datafile and checks that it conforms with
+ the expected space ID and flags. The file should exist and be
+ successfully opened in order for this function to validate it.
+ @param[in] space_id The expected tablespace ID.
+ @param[in] flags The expected tablespace flags.
+ @retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+ m_is_valid is also set true on success, else false. */
+ dberr_t validate_to_dd(uint32_t space_id, uint32_t flags)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Validates this datafile for the purpose of recovery.
+ The file should exist and be successfully opened. We initially
+ open it in read-only mode because we just want to read the SpaceID.
+ However, if the first page is corrupt and needs to be restored
+ from the doublewrite buffer, we will reopen it in write mode and
+ ry to restore that page.
+ @retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+ m_is_valid is also set true on success, else false. */
+ dberr_t validate_for_recovery()
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Checks the consistency of the first page of a datafile when the
+ tablespace is opened. This occurs before the fil_space_t is created
+ so the Space ID found here must not already be open.
+ m_is_valid is set true on success, else false.
+ @retval DB_SUCCESS on if the datafile is valid
+ @retval DB_CORRUPTION if the datafile is not readable
+ @retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+ dberr_t validate_first_page()
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Get Datafile::m_filepath.
+ @return m_filepath */
+ const char* filepath() const
+ {
+ return(m_filepath);
+ }
+
+ /** Get Datafile::m_handle.
+ @return m_handle */
+ pfs_os_file_t handle() const
+ {
+ return(m_handle);
+ }
+
+ /** @return detached file handle */
+ pfs_os_file_t detach()
+ {
+ pfs_os_file_t detached = m_handle;
+ m_handle = OS_FILE_CLOSED;
+ return detached;
+ }
+
+ /** Get Datafile::m_order.
+ @return m_order */
+ ulint order() const
+ {
+ return(m_order);
+ }
+
+ /** Get Datafile::m_space_id.
+ @return m_space_id */
+ uint32_t space_id() const { return m_space_id; }
+
+ /** Get Datafile::m_flags.
+ @return m_flags */
+ uint32_t flags() const { return m_flags; }
+
+ /**
+ @return true if m_handle is open, false if not */
+ bool is_open() const { return m_handle != OS_FILE_CLOSED; }
+
+ /** Get Datafile::m_is_valid.
+ @return m_is_valid */
+ bool is_valid() const
+ {
+ return(m_is_valid);
+ }
+
+ /** Get the last OS error reported
+ @return m_last_os_error */
+ ulint last_os_error() const
+ {
+ return(m_last_os_error);
+ }
+
+ /** Check whether the file is empty.
+ @return true if file is empty */
+ bool is_empty_file() const
+ {
+#ifdef _WIN32
+ os_offset_t offset =
+ (os_offset_t) m_file_info.nFileSizeLow
+ | ((os_offset_t) m_file_info.nFileSizeHigh << 32);
+
+ return (offset == 0);
+#else
+ return (m_file_info.st_size == 0);
+#endif
+ }
+
+ /** Check if the file exist.
+ @return true if file exists. */
+ bool exists() const { return m_exists; }
+
+ /** Test if the filepath provided looks the same as this filepath
+ by string comparison. If they are two different paths to the same
+ file, same_as() will be used to show that after the files are opened.
+ @param[in] other filepath to compare with
+ @retval true if it is the same filename by char comparison
+ @retval false if it looks different */
+ bool same_filepath_as(const char* other) const;
+
+ /** Test if another opened datafile is the same file as this object.
+ @param[in] other Datafile to compare with
+ @return true if it is the same file, else false */
+ bool same_as(const Datafile& other) const;
+
+ /** Get access to the first data page.
+ It is valid after open_read_only() succeeded.
+ @return the first data page */
+ const byte* get_first_page() const { return(m_first_page); }
+
+ void set_space_id(uint32_t space_id) { m_space_id= space_id; }
+
+ void set_flags(uint32_t flags) { m_flags = flags; }
+private:
+ /** Free the filepath buffer. */
+ void free_filepath();
+
+ /** Set the filename pointer to the start of the file name
+ in the filepath. */
+ void set_filename()
+ {
+ if (!m_filepath) {
+ return;
+ }
+
+ if (char *last_slash = strrchr(m_filepath, '/')) {
+#if _WIN32
+ if (char *last = strrchr(m_filepath, '\\')) {
+ if (last > last_slash) {
+ last_slash = last;
+ }
+ }
+#endif
+ m_filename = last_slash + 1;
+ } else {
+ m_filename = m_filepath;
+ }
+ }
+
+ /** Create/open a data file.
+ @param[in] read_only_mode if true, then readonly mode checks
+ are enforced.
+ @return DB_SUCCESS or error code */
+ dberr_t open_or_create(bool read_only_mode)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Reads a few significant fields from the first page of the
+ datafile, which must already be open.
+ @param[in] read_only_mode if true, then readonly mode checks
+ are enforced.
+ @return DB_SUCCESS or DB_IO_ERROR if page cannot be read */
+ dberr_t read_first_page(bool read_only_mode)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Free the first page from memory when it is no longer needed. */
+ void free_first_page();
+
+ /** Set the Datafile::m_open_flags.
+ @param open_flags The Open flags to set. */
+ void set_open_flags(os_file_create_t open_flags)
+ {
+ m_open_flags = open_flags;
+ };
+
+ /** Determine if this datafile is on a Raw Device
+ @return true if it is a RAW device. */
+ bool is_raw_device()
+ {
+ return(m_type != SRV_NOT_RAW);
+ }
+
+ /* DATA MEMBERS */
+
+protected:
+ /** Physical file path with base name and extension */
+ char* m_filepath;
+
+private:
+ /** Determine the space id of the given file descriptor by reading
+ a few pages from the beginning of the .ibd file.
+ @return DB_SUCCESS if space id was successfully identified,
+ else DB_ERROR. */
+ dberr_t find_space_id();
+
+ /** Points into m_filepath to the file name with extension */
+ char* m_filename;
+
+ /** Open file handle */
+ pfs_os_file_t m_handle;
+
+ /** Flags to use for opening the data file */
+ os_file_create_t m_open_flags;
+
+ /** size in megabytes or pages; converted from megabytes to
+ pages in SysTablespace::normalize_size() */
+ uint32_t m_size;
+
+ /** ordinal position of this datafile in the tablespace */
+ ulint m_order;
+
+ /** The type of the data file */
+ device_t m_type;
+
+ /** Tablespace ID. Contained in the datafile header.
+ If this is a system tablespace, FSP_SPACE_ID is only valid
+ in the first datafile. */
+ uint32_t m_space_id;
+
+ /** Tablespace flags. Contained in the datafile header.
+ If this is a system tablespace, FSP_SPACE_FLAGS are only valid
+ in the first datafile. */
+ uint32_t m_flags;
+
+ /** true if file already existed on startup */
+ bool m_exists;
+
+ /* true if the tablespace is valid */
+ bool m_is_valid;
+
+ /** Aligned buffer to hold first page */
+ byte* m_first_page;
+
+protected:
+ /** Last OS error received so it can be reported if needed. */
+ ulint m_last_os_error;
+
+public:
+ /** true if table is deferred during recovery */
+ bool m_defer=false;
+ /** Use the following to determine the uniqueness of this datafile. */
+#ifdef _WIN32
+ /* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */
+ BY_HANDLE_FILE_INFORMATION m_file_info;
+#else
+ /* Use field st_ino. */
+ struct stat m_file_info;
+#endif /* WIN32 */
+};
+
+
+/** Data file control information. */
+class RemoteDatafile : public Datafile
+{
+private:
+ /** Link filename (full path) */
+ char* m_link_filepath;
+
+public:
+
+ RemoteDatafile()
+ :
+ m_link_filepath()
+ {
+ /* No op - base constructor is called. */
+ }
+
+ RemoteDatafile(const char*, ulint, ulint)
+ :
+ m_link_filepath()
+ {
+ /* No op - base constructor is called. */
+ }
+
+ ~RemoteDatafile() override
+ {
+ shutdown();
+ }
+
+ /** Release the resources. */
+ void shutdown() override;
+
+ /** Get the link filepath.
+ @return m_link_filepath */
+ const char* link_filepath() const
+ {
+ return(m_link_filepath);
+ }
+
+ /** Attempt to read the contents of an .isl file into m_filepath.
+ @param name table name
+ @return filepath()
+ @retval nullptr if the .isl file does not exist or cannot be read */
+ const char* open_link_file(const fil_space_t::name_type name);
+
+ /** Delete an InnoDB Symbolic Link (ISL) file. */
+ void delete_link_file(void);
+
+ /******************************************************************
+ Global Static Functions; Cannot refer to data members.
+ ******************************************************************/
+
+ /** Create InnoDB Symbolic Link (ISL) file.
+ @param name tablespace name
+ @param filepath full file name
+ @return DB_SUCCESS or error code */
+ static dberr_t create_link_file(fil_space_t::name_type name,
+ const char *filepath);
+
+ /** Delete an InnoDB Symbolic Link (ISL) file by name.
+ @param name tablespace name */
+ static void delete_link_file(fil_space_t::name_type name);
+};
+#endif /* fsp0file_h */
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
new file mode 100644
index 00000000..26261554
--- /dev/null
+++ b/storage/innobase/include/fsp0fsp.h
@@ -0,0 +1,762 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "assume_aligned.h"
+#include "fsp0types.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0mtr.h"
+#include "page0types.h"
+#include "rem0types.h"
+#else
+# include "mach0data.h"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** @return the PAGE_SSIZE flags for the current innodb_page_size */
+#define FSP_FLAGS_PAGE_SSIZE() \
+ ((srv_page_size == UNIV_PAGE_SIZE_ORIG) ? \
+ 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \
+ << FSP_FLAGS_POS_PAGE_SSIZE)
+
+/** @return the PAGE_SSIZE flags for the current innodb_page_size in
+full checksum format */
+#define FSP_FLAGS_FCRC32_PAGE_SSIZE() \
+ ((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \
+ << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+
+/* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20;
+see the table in fsp0types.h @{ */
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 \
+ (FSP_FLAGS_POS_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 \
+ (FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 + 1)
+/** Zero relative shift position of the ATOMIC_WRITES field */
+#define FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 \
+ (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 + 4)
+/** Zero relative shift position of the PAGE_SSIZE field */
+#define FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101 \
+ (FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 + 2)
+
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101 \
+ (1U << FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101 \
+ (15U << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101)
+/** Bit mask of the ATOMIC_WRITES field */
+#define FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101 \
+ (3U << FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101 \
+ (15U << FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101)
+
+/** Return the value of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101)
+/** Return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101) \
+ >> FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101)
+
+/* @} */
+
+/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */
+
+/** Offset of the space header within a file page */
+#define FSP_HEADER_OFFSET FIL_PAGE_DATA
+
+/* The data structures in files are defined just as byte strings in C */
+typedef byte xdes_t;
+
+/* SPACE HEADER
+ ============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID 0 /* space id */
+#define FSP_NOT_USED 4 /* this field contained a value up to
+ which we know that the modifications
+ in the database have been flushed to
+ the file space; not used now */
+#define FSP_SIZE 8 /* Current size of the space in
+ pages */
+#define FSP_FREE_LIMIT 12 /* Minimum page number for which the
+ free list has not been initialized:
+ the pages >= this limit are, by
+ definition, free; note that in a
+ single-table tablespace where size
+ < 64 pages, this number is 64, i.e.,
+ we have initialized the space
+ about the first extent, but have not
+ physically allocated those pages to the
+ file */
+#define FSP_SPACE_FLAGS 16 /* fsp_space_t.flags, similar to
+ dict_table_t::flags */
+#define FSP_FRAG_N_USED 20 /* number of used pages in the
+ FSP_FREE_FRAG list */
+#define FSP_FREE 24 /* list of free extents */
+#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents not
+ belonging to any segment */
+#define FSP_FULL_FRAG (24 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents not belonging
+ to any segment */
+#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE)
+ /* 8 bytes which give the first unused
+ segment id */
+#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where all the segment inode
+ slots are reserved */
+#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where not all the segment
+ header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define FSP_FREE_ADD 4 /* this many free extents are added
+ to the free list from above
+ FSP_FREE_LIMIT at a time */
+/* @} */
+
+/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
+
+/* FILE SEGMENT INODE
+ ==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages. */
+
+typedef byte fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA
+ /* the list node for linking
+ segment inode pages */
+
+#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0,
+ it means that the header is unused */
+#define FSEG_NOT_FULL_N_USED 8
+ /* number of used segment pages in
+ the FSEG_NOT_FULL list */
+#define FSEG_FREE 12
+ /* list of free extents of this
+ segment */
+#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents */
+#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents */
+#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE)
+ /* magic number used in debugging */
+#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE)
+ /* array of individual pages
+ belonging to this segment in fsp
+ fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2)
+ /* number of slots in the array for
+ the fragment pages */
+#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its
+ page number within space, FIL_NULL
+ means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE \
+ (16 + 3 * FLST_BASE_NODE_SIZE \
+ + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2};
+
+#define FSEG_FILLFACTOR 8 /* If the number of unused but reserved
+ pages in a segment is less than
+ reserved pages / FSEG_FILLFACTOR,
+ and there are
+ at least FSEG_FRAG_LIMIT used pages,
+ then we allow a new empty extent to
+ be added to the segment in
+ fseg_alloc_free_page_general().
+ Otherwise, we
+ use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS
+ /* If the segment has >= this many
+ used pages, it may be expanded by
+ allocating extents to the segment;
+ until that only individual fragment
+ pages are allocated from the space */
+
+#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment
+ is at least this many extents, we
+ allow extents to be put to the free
+ list of the extent: at most
+ FSEG_FREE_LIST_MAX_LEN many */
+#define FSEG_FREE_LIST_MAX_LEN 4
+/* @} */
+
+/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */
+
+/* EXTENT DESCRIPTOR
+ =================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple version to clean. */
+
+/*-------------------------------------*/
+#define XDES_ID 0 /* The identifier of the segment
+ to which this extent belongs */
+#define XDES_FLST_NODE 8 /* The list node data structure
+ for the descriptors */
+#define XDES_STATE (FLST_NODE_SIZE + 8)
+ /* contains state information
+ of the extent */
+#define XDES_BITMAP (FLST_NODE_SIZE + 12)
+ /* Descriptor bitmap of the pages
+ in the extent */
+/*-------------------------------------*/
+
+#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */
+#define XDES_FREE_BIT 0 /* Index of the bit which tells if
+ the page is free */
+#define XDES_CLEAN_BIT 1 /* NOTE: currently not used!
+ Index of the bit which tells if
+ there are old versions of tuples
+ on the page */
+/* States of a descriptor */
+#define XDES_FREE 1 /* extent is in free list of space */
+#define XDES_FREE_FRAG 2 /* extent is in free fragment list of
+ space */
+#define XDES_FULL_FRAG 3 /* extent is in full fragment list of
+ space */
+#define XDES_FSEG 4 /* extent belongs to a segment */
+
+/** File extent data structure size in bytes. */
+#define XDES_SIZE \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MAX page size. */
+#define XDES_SIZE_MAX \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MIN page size. */
+#define XDES_SIZE_MIN \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE))
+
+/** Offset of the descriptor array on a descriptor page */
+#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+/**
+Determine if a page is marked free.
+@param[in] descr extent descriptor
+@param[in] offset page offset within extent
+@return whether the page is free */
+inline bool xdes_is_free(const xdes_t *descr, ulint offset)
+{
+ ut_ad(offset < FSP_EXTENT_SIZE);
+ ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset;
+ return ut_bit_get_nth(descr[XDES_BITMAP + (index >> 3)], index & 7);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/* @} */
+
+/** Read a tablespace header field.
+@param[in] page first page of a tablespace
+@param[in] field the header field
+@return the contents of the header field */
+inline uint32_t fsp_header_get_field(const page_t* page, ulint field)
+{
+ return mach_read_from_4(FSP_HEADER_OFFSET + field +
+ my_assume_aligned<UNIV_ZIP_SIZE_MIN>(page));
+}
+
+/** Read the flags from the tablespace header page.
+@param[in] page first page of a tablespace
+@return the contents of FSP_SPACE_FLAGS */
+inline uint32_t fsp_header_get_flags(const page_t *page)
+{
+ return fsp_header_get_field(page, FSP_SPACE_FLAGS);
+}
+
+/** Get the byte offset of encryption information in page 0.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return byte offset relative to FSP_HEADER_OFFSET */
+inline MY_ATTRIBUTE((pure, warn_unused_result))
+ulint fsp_header_get_encryption_offset(ulint zip_size)
+{
+ return zip_size
+ ? XDES_ARR_OFFSET + XDES_SIZE * zip_size / FSP_EXTENT_SIZE
+ : XDES_ARR_OFFSET + (XDES_SIZE << srv_page_size_shift)
+ / FSP_EXTENT_SIZE;
+}
+
+/** Check the encryption key from the first page of a tablespace.
+@param[in] fsp_flags tablespace flags
+@param[in] page first page of a tablespace
+@return true if success */
+bool
+fsp_header_check_encryption_key(
+ ulint fsp_flags,
+ page_t* page);
+
+/** Initialize a tablespace header.
+@param[in,out] space tablespace
+@param[in] size current size in blocks
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Create a new segment.
+@param space tablespace
+@param byte_offset byte offset of the created segment header
+@param mtr mini-transaction
+@param err error code
+@param has_done_reservation whether fsp_reserve_free_extents() was invoked
+@param block block where segment header is placed,
+ or NULL to allocate an additional page for that
+@return the block where the segment header is placed, x-latched
+@retval nullptr if could not create segment */
+buf_block_t*
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
+ bool has_done_reservation= false, buf_block_t *block= nullptr)
+ MY_ATTRIBUTE((nonnull(1,3,4), warn_unused_result));
+
+/** Calculate the number of pages reserved by a segment,
+and how many pages are currently used.
+@param[in] block buffer block containing the file segment header
+@param[in] header file segment header
+@param[out] used number of pages that are used (not more than reserved)
+@param[in,out] mtr mini-transaction
+@return number of reserved pages */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+ const fseg_header_t *header, ulint *used,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated */
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in/out: segment header */
+ uint32_t hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ bool has_done_reservation, /*!< in: true if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized. */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_t::release_free_extents()!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special
+case. In this function we would liberally reserve several extents for
+every page split or merge in a B-tree. But we do not want to waste disk space
+if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
+different rules in that special case, just ensuring that there are n_pages
+free pages available.
+
+@param[out] n_reserved number of extents actually reserved; if we
+ return true and the tablespace size is <
+ FSP_EXTENT_SIZE pages, then this can be 0,
+ otherwise it is n_ext
+@param[in,out] space tablespace
+@param[in] n_ext number of extents to reserve
+@param[in] alloc_type page reservation type (FSP_BLOB, etc)
+@param[in,out] mtr the mini transaction
+@param[out] err error code
+@param[in] n_pages for small tablespaces (tablespace size is
+ less than FSP_EXTENT_SIZE), number of free
+ pages to reserve.
+@return error code
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
+fsp_reserve_free_extents(
+ uint32_t* n_reserved,
+ fil_space_t* space,
+ uint32_t n_ext,
+ fsp_reserve_t alloc_type,
+ mtr_t* mtr,
+ uint32_t n_pages = 2);
+
+/** Free a page in a file segment.
+@param[in,out] seg_header file segment header
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@param[in] have_latch whether space->x_lock() was already called
+@return error code */
+dberr_t
+fseg_free_page(
+ fseg_header_t* seg_header,
+ fil_space_t* space,
+ uint32_t offset,
+ mtr_t* mtr,
+ bool have_latch = false)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine whether a page is allocated.
+@param space tablespace
+@param page page number
+@return error code
+@retval DB_SUCCESS if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param header segment header; NOTE: if the header resides on first
+ page of the frag list of the segment, this pointer
+ becomes obsolete after the last freeing step
+@param mtr mini-transaction
+@param ahi Drop the adaptive hash index
+@return whether the freeing was completed */
+bool
+fseg_free_step(
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Frees part of a segment. Differs from fseg_free_step because
+this function leaves the header page unfreed.
+@param header segment header which must reside on the first
+ fragment page of the segment
+@param mtr mini-transaction
+@param ahi drop the adaptive hash index
+@return whether the freeing was completed, except for the header page */
+bool
+fseg_free_step_not_header(
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Reset the page type.
+Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in] block block with invalid FIL_PAGE_TYPE
+@param[in] type expected page type
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD
+void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr);
+
+/** Check (and if needed, reset) the page type.
+Data files created before MySQL 5.1.48 may contain
+garbage in the FIL_PAGE_TYPE field.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in] page_id page number
+@param[in,out] page page with possibly invalid FIL_PAGE_TYPE
+@param[in] type expected page type
+@param[in,out] mtr mini-transaction */
+inline void
+fil_block_check_type(
+ const buf_block_t& block,
+ ulint type,
+ mtr_t* mtr)
+{
+ if (UNIV_UNLIKELY(type != fil_page_get_type(block.page.frame)))
+ fil_block_reset_type(block, type, mtr);
+}
+
+/** Checks if a page address is an extent descriptor page address.
+@param[in] page_id page id
+@param[in] physical_size page size
+@return whether a descriptor page */
+inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size)
+{
+ return (page_id.page_no() & (physical_size - 1)) == FSP_XDES_OFFSET;
+}
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out] block buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block);
+
+/** Initialize a file page.
+@param[in] space tablespace
+@param[in,out] block file page
+@param[in,out] mtr mini-transaction */
+inline void fsp_init_file_page(
+#ifdef UNIV_DEBUG
+ const fil_space_t* space,
+#endif
+ buf_block_t* block, mtr_t* mtr)
+{
+ ut_d(space->modify_check(*mtr));
+ ut_ad(space->id == block->page.id().space());
+ fsp_apply_init_file_page(block);
+ mtr->init(block);
+}
+
+#ifndef UNIV_DEBUG
+# define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr)
+#endif
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+#endif /* UNIV_BTR_PRINT */
+
+/** Convert FSP_SPACE_FLAGS from the buggy MariaDB 10.1.0..10.1.20 format.
+@param[in] flags the contents of FSP_SPACE_FLAGS
+@return the flags corrected from the buggy MariaDB 10.1 format
+@retval UINT32_MAX if the flags are not in the buggy 10.1 format */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_convert_from_101(uint32_t flags)
+{
+ DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return UINT32_MAX;);
+ if (flags == 0 || fil_space_t::full_crc32(flags)) {
+ return(flags);
+ }
+
+ if (flags >> 18) {
+ /* The most significant FSP_SPACE_FLAGS bit that was ever set
+ by MariaDB 10.1.0 to 10.1.20 was bit 17 (misplaced DATA_DIR flag).
+ The flags must be less than 1<<18 in order to be valid. */
+ return UINT32_MAX;
+ }
+
+ if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+ == FSP_FLAGS_MASK_ATOMIC_BLOBS) {
+ /* If the "atomic blobs" flag (indicating
+ ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag
+ is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag
+ must also be set. */
+ return UINT32_MAX;
+ }
+
+ /* Bits 6..10 denote compression in MariaDB 10.1.0 to 10.1.20.
+ They must be either 0b00000 or 0b00011 through 0b10011.
+ In correct versions, these bits would be
+ 0bd0sss where d is the DATA_DIR flag (garbage bit) and
+ sss is the PAGE_SSIZE (3, 4, 6, or 7).
+
+ NOTE: MariaDB 10.1.0 to 10.1.20 can misinterpret
+ uncompressed data files with innodb_page_size=4k or 64k as
+ compressed innodb_page_size=16k files. Below is an exhaustive
+ state space analysis.
+
+ -0by1zzz: impossible (the bit 4 must be clean; see above)
+ -0b101xx: DATA_DIR, innodb_page_size>4k: invalid (COMPRESSION_LEVEL>9)
+ +0bx0011: innodb_page_size=4k:
+ !!! Misinterpreted as COMPRESSION_LEVEL=9 or 1, COMPRESSION=1.
+ -0bx0010: impossible, because sss must be 0b011 or 0b1xx
+ -0bx0001: impossible, because sss must be 0b011 or 0b1xx
+ -0b10000: DATA_DIR, innodb_page_size=16:
+ invalid (COMPRESSION_LEVEL=8 but COMPRESSION=0)
+ +0b00111: no DATA_DIR, innodb_page_size=64k:
+ !!! Misinterpreted as COMPRESSION_LEVEL=3, COMPRESSION=1.
+ -0b00101: impossible, because sss must be 0 for 16k, not 0b101
+ -0b001x0: no DATA_DIR, innodb_page_size=32k or 8k:
+ invalid (COMPRESSION_LEVEL=3 but COMPRESSION=0)
+ +0b00000: innodb_page_size=16k (looks like COMPRESSION=0)
+ ??? Could actually be compressed; see PAGE_SSIZE below */
+ const uint32_t level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(
+ flags);
+ if (FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) != (level != 0)
+ || level > 9) {
+ /* The compression flags are not in the buggy MariaDB
+ 10.1 format. */
+ return UINT32_MAX;
+ }
+ if (!(~flags & FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101)) {
+ /* The ATOMIC_WRITES flags cannot be 0b11.
+ (The bits 11..12 should actually never be 0b11,
+ because in MySQL they would be SHARED|TEMPORARY.) */
+ return UINT32_MAX;
+ }
+
+ /* Bits 13..16 are the wrong position for PAGE_SSIZE, and they
+ should contain one of the values 3,4,6,7, that is, be of the form
+ 0b0011 or 0b01xx (except 0b0101).
+ In correct versions, these bits should be 0bc0se
+ where c is the MariaDB COMPRESSED flag
+ and e is the MySQL 5.7 ENCRYPTION flag
+ and s is the MySQL 8.0 SDI flag. MariaDB can only support s=0, e=0.
+
+ Compressed innodb_page_size=16k tables with correct FSP_SPACE_FLAGS
+ will be properly rejected by older MariaDB 10.1.x because they
+ would read as PAGE_SSIZE>=8 which is not valid. */
+
+ const uint32_t ssize = FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags);
+ if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) {
+ /* the page_size is not between 4k and 64k;
+ 16k should be encoded as 0, not 5 */
+ return UINT32_MAX;
+ }
+ const uint32_t zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ if (zssize == 0) {
+ /* not ROW_FORMAT=COMPRESSED */
+ } else if (zssize > (ssize ? ssize : 5)) {
+ /* invalid KEY_BLOCK_SIZE */
+ return UINT32_MAX;
+ } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE
+ | FSP_FLAGS_MASK_ATOMIC_BLOBS)) {
+ /* both these flags should be set for
+ ROW_FORMAT=COMPRESSED */
+ return UINT32_MAX;
+ }
+
+ flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE
+ | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags)
+ << FSP_FLAGS_POS_PAGE_COMPRESSION);
+ ut_ad(fil_space_t::is_valid_flags(flags, false));
+ return(flags);
+}
+
+/** Compare tablespace flags.
+@param[in] expected expected flags from dict_tf_to_fsp_flags()
+@param[in] actual flags read from FSP_SPACE_FLAGS
+@return whether the flags match */
+MY_ATTRIBUTE((warn_unused_result))
+inline bool fsp_flags_match(uint32_t expected, uint32_t actual)
+{
+ expected&= ~FSP_FLAGS_MEM_MASK;
+ ut_ad(fil_space_t::is_valid_flags(expected, false));
+ return actual == expected || fsp_flags_convert_from_101(actual) == expected;
+}
+
+/** Determine if FSP_SPACE_FLAGS are from an incompatible MySQL format.
+@param flags the contents of FSP_SPACE_FLAGS
+@return MySQL flags shifted.
+@retval 0, if not a MySQL incompatible format. */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_is_incompatible_mysql(uint32_t flags)
+{
+ /*
+ MySQL-8.0 SDI flag (bit 14),
+ or MySQL 5.7 Encyption flag (bit 13)
+ */
+ return flags >> 13 & 3;
+}
+
+/** Determine the descriptor index within a descriptor page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] offset page offset
+@return descriptor index */
+inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset)
+{
+ return ut_2pow_remainder<ulint>(offset,
+ zip_size ? zip_size : srv_page_size)
+ / FSP_EXTENT_SIZE;
+}
+
+/** Determine the descriptor page number for a page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] offset page offset
+@return descriptor page offset */
+inline uint32_t xdes_calc_descriptor_page(ulint zip_size, uint32_t offset)
+{
+ compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET
+ + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX)
+ * XDES_SIZE_MAX);
+ compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET
+ + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN)
+ * XDES_SIZE_MIN);
+
+ ut_ad(srv_page_size > XDES_ARR_OFFSET
+ + (srv_page_size / FSP_EXTENT_SIZE)
+ * XDES_SIZE);
+ ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+ + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE)
+ * XDES_SIZE);
+ ut_ad(!zip_size
+ || zip_size > XDES_ARR_OFFSET
+ + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+ return ut_2pow_round(offset,
+ uint32_t(zip_size ? zip_size : srv_page_size));
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
new file mode 100644
index 00000000..a2bb46d3
--- /dev/null
+++ b/storage/innobase/include/fsp0space.h
@@ -0,0 +1,209 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0space.h
+Shared tablespace interface
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0space_h
+#define fsp0space_h
+
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "fsp0types.h"
+
+#include <vector>
+
+/** Data structure that contains the information about shared tablespaces.
+Currently this can be the system tablespace or a temporary table tablespace */
+class Tablespace {
+
+public:
+ typedef std::vector<Datafile, ut_allocator<Datafile> > files_t;
+
+ /** Data file information - each Datafile can be accessed globally */
+ files_t m_files;
+ /** Data file iterator */
+ typedef files_t::iterator iterator;
+ /** Data file iterator */
+ typedef files_t::const_iterator const_iterator;
+
+ Tablespace() {}
+
+ virtual ~Tablespace()
+ {
+ shutdown();
+ ut_ad(m_files.empty());
+ ut_ad(m_space_id == UINT32_MAX);
+ }
+
+ // Disable copying
+ Tablespace(const Tablespace&);
+ Tablespace& operator=(const Tablespace&);
+
+ /** Data file iterator */
+ const_iterator begin() const { return m_files.begin(); }
+ /** Data file iterator */
+ const_iterator end() const { return m_files.end(); }
+ /** Data file iterator */
+ iterator begin() { return m_files.begin(); }
+ /** Data file iterator */
+ iterator end() { return m_files.end(); }
+
+ /** Set tablespace path and filename members.
+ @param[in] path where tablespace file(s) resides
+ @param[in] len length of the file path */
+ void set_path(const char* path, size_t len)
+ {
+ ut_ad(m_path == NULL);
+ m_path = mem_strdupl(path, len);
+ ut_ad(m_path != NULL);
+ }
+
+ /** Set tablespace path and filename members.
+ @param[in] path where tablespace file(s) resides */
+ void set_path(const char* path)
+ {
+ set_path(path, strlen(path));
+ }
+
+ /** Get tablespace path
+ @return tablespace path */
+ const char* path() const
+ {
+ return(m_path);
+ }
+
+ /** Set the space id of the tablespace
+ @param[in] space_id tablespace ID to set */
+ void set_space_id(uint32_t space_id)
+ {
+ ut_ad(m_space_id == UINT32_MAX);
+ m_space_id = space_id;
+ }
+
+ /** Get the space id of the tablespace
+ @return m_space_id space id of the tablespace */
+ uint32_t space_id() const { return m_space_id; }
+
+ /** Set the tablespace flags
+ @param[in] fsp_flags tablespace flags */
+ void set_flags(uint32_t fsp_flags)
+ {
+ ut_ad(fil_space_t::is_valid_flags(fsp_flags, false));
+ m_flags = fsp_flags;
+ }
+
+ /** Get the tablespace flags
+ @return m_flags tablespace flags */
+ uint32_t flags() const { return m_flags; }
+
+ /** Get the tablespace encryption mode
+ @return m_mode tablespace encryption mode */
+ fil_encryption_t encryption_mode() const { return m_mode; }
+
+ /** Get the tablespace encryption key_id
+ @return m_key_id tablespace encryption key_id */
+ uint32_t key_id() const { return m_key_id; }
+
+ /** Set Ignore Read Only Status for tablespace.
+ @param[in] read_only_status read only status indicator */
+ void set_ignore_read_only(bool read_only_status)
+ {
+ m_ignore_read_only = read_only_status;
+ }
+
+ /** Free the memory allocated by the Tablespace object */
+ void shutdown();
+
+ /** @return the sum of the file sizes of each Datafile */
+ uint32_t get_sum_of_sizes() const
+ {
+ uint32_t sum = 0;
+
+ for (const_iterator it = begin(); it != end(); ++it) {
+ sum += it->m_size;
+ }
+
+ return(sum);
+ }
+
+ /** Open or Create the data files if they do not exist.
+ @param[in] is_temp whether this is a temporary tablespace
+ @return DB_SUCCESS or error code */
+ dberr_t open_or_create(bool is_temp)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Delete all the data files. */
+ void delete_files();
+
+ /** Check if two tablespaces have common data file names.
+ @param[in] other_space Tablespace to check against this.
+ @return true if they have the same data filenames and paths */
+ bool intersection(const Tablespace* other_space);
+
+ /** Use the ADD DATAFILE path to create a Datafile object and add
+ it to the front of m_files. Parse the datafile path into a path
+ and a basename with extension 'ibd'. This datafile_path provided
+ may be an absolute or relative path, but it must end with the
+ extension .ibd and have a basename of at least 1 byte.
+
+ Set tablespace m_path member and add a Datafile with the filename.
+ @param[in] datafile_path full path of the tablespace file. */
+ dberr_t add_datafile(
+ const char* datafile_path);
+
+ /* Return a pointer to the first Datafile for this Tablespace
+ @return pointer to the first Datafile for this Tablespace*/
+ Datafile* first_datafile()
+ {
+ ut_a(!m_files.empty());
+ return(&m_files.front());
+ }
+private:
+ /**
+ @param[in] filename Name to lookup in the data files.
+ @return true if the filename exists in the data files */
+ bool find(const char* filename) const;
+
+ /** Note that the data file was found.
+ @param[in] file data file object */
+ void file_found(Datafile& file);
+
+ /** Tablespace ID */
+ uint32_t m_space_id = UINT32_MAX;
+ /** Tablespace flags */
+ uint32_t m_flags = UINT32_MAX;
+
+ /** Path where tablespace files will reside, excluding a filename */
+ char* m_path;
+
+ /** Encryption mode and key_id */
+ fil_encryption_t m_mode;
+ uint32_t m_key_id;
+
+protected:
+ /** Ignore server read only configuration for this tablespace. */
+ bool m_ignore_read_only = false;
+};
+
+#endif /* fsp0space_h */
diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h
new file mode 100644
index 00000000..514f3fdb
--- /dev/null
+++ b/storage/innobase/include/fsp0sysspace.h
@@ -0,0 +1,278 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0sysspace.h
+Multi file, shared, system tablespace implementation.
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0sysspace_h
+#define fsp0sysspace_h
+
+#include "fsp0space.h"
+
+/** If the last data file is auto-extended, we add this many pages to it
+at a time. We have to make this public because it is a config variable. */
+extern uint sys_tablespace_auto_extend_increment;
+
+/** Data structure that contains the information about shared tablespaces.
+Currently this can be the system tablespace or a temporary table tablespace */
+class SysTablespace : public Tablespace
+{
+public:
+
+ SysTablespace()
+ :
+ m_auto_extend_last_file(),
+ m_last_file_size_max(),
+ m_created_new_raw(),
+ m_is_tablespace_full(false),
+ m_sanity_checks_done(false)
+ {
+ /* No op */
+ }
+
+ ~SysTablespace() override
+ {
+ shutdown();
+ }
+
+ /** Set tablespace full status
+ @param[in] is_full true if full */
+ void set_tablespace_full_status(bool is_full)
+ {
+ m_is_tablespace_full = is_full;
+ }
+
+ /** Get tablespace full status
+ @return true if table is full */
+ bool get_tablespace_full_status()
+ {
+ return(m_is_tablespace_full);
+ }
+
+ /** Set sanity check status
+ @param[in] status true if sanity checks are done */
+ void set_sanity_check_status(bool status)
+ {
+ m_sanity_checks_done = status;
+ }
+
+ /** Get sanity check status
+ @return true if sanity checks are done */
+ bool get_sanity_check_status()
+ {
+ return(m_sanity_checks_done);
+ }
+
+ /** Parse the input params and populate member variables.
+ @param filepath path to data files
+ @param supports_raw true if it supports raw devices
+ @return true on success parse */
+ bool parse_params(const char* filepath, bool supports_raw);
+
+ /** Check the data file specification.
+ @param[out] create_new_db true if a new database
+ is to be created
+ @param[in] min_expected_size expected tablespace
+ size in bytes
+ @return DB_SUCCESS if all OK else error code */
+ dberr_t check_file_spec(
+ bool* create_new_db,
+ ulint min_expected_tablespace_size);
+
+ /** Free the memory allocated by parse() */
+ void shutdown();
+
+ /** Normalize the file size, convert to extents. */
+ void normalize_size();
+
+ /**
+ @return true if a new raw device was created. */
+ bool created_new_raw() const
+ {
+ return(m_created_new_raw);
+ }
+
+ /**
+ @return auto_extend value setting */
+ ulint can_auto_extend_last_file() const
+ {
+ return(m_auto_extend_last_file);
+ }
+
+ /** Set the last file size.
+ @param[in] size the size to set */
+ void set_last_file_size(uint32_t size)
+ {
+ ut_ad(!m_files.empty());
+ m_files.back().m_size = size;
+ }
+
+ /** Get the size of the last data file in the tablespace
+ @return the size of the last data file in the array */
+ uint32_t last_file_size() const
+ {
+ ut_ad(!m_files.empty());
+ return(m_files.back().m_size);
+ }
+
+ /**
+ @return the autoextend increment in pages. */
+ uint32_t get_autoextend_increment() const
+ {
+ return sys_tablespace_auto_extend_increment
+ << (20 - srv_page_size_shift);
+ }
+
+ /**
+ @return next increment size */
+ uint32_t get_increment() const;
+
+ /** Open or create the data files
+ @param[in] is_temp whether this is a temporary tablespace
+ @param[in] create_new_db whether we are creating a new database
+ @param[out] sum_new_sizes sum of sizes of the new files added
+ @return DB_SUCCESS or error code */
+ dberr_t open_or_create(
+ bool is_temp,
+ bool create_new_db,
+ ulint* sum_new_sizes)
+ MY_ATTRIBUTE((warn_unused_result));
+
+private:
+ /** Check the tablespace header for this tablespace.
+ @return DB_SUCCESS or error code */
+ inline dberr_t read_lsn_and_check_flags();
+
+ /**
+ @return true if the last file size is valid. */
+ bool is_valid_size() const
+ {
+ return(m_last_file_size_max >= last_file_size());
+ }
+
+ /**
+ @return true if configured to use raw devices */
+ bool has_raw_device();
+
+ /** Note that the data file was not found.
+ @param[in] file data file object
+ @param[out] create_new_db true if a new instance to be created
+ @return DB_SUCESS or error code */
+ dberr_t file_not_found(Datafile& file, bool* create_new_db);
+
+ /** Note that the data file was found.
+ @param[in,out] file data file object
+ @return true if a new instance to be created */
+ bool file_found(Datafile& file);
+
+ /** Create a data file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t create(Datafile& file);
+
+ /** Create a data file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t create_file(Datafile& file);
+
+ /** Open a data file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t open_file(Datafile& file);
+
+ /** Set the size of the file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t set_size(Datafile& file);
+
+ /** Convert a numeric string that optionally ends in G or M, to a
+ number containing megabytes.
+ @param[in] ptr string with a quantity in bytes
+ @param[out] megs the number in megabytes
+ @return next character in string */
+ static char* parse_units(char* ptr, ulint* megs);
+
+private:
+ enum file_status_t {
+ FILE_STATUS_VOID = 0, /** status not set */
+ FILE_STATUS_RW_PERMISSION_ERROR,/** permission error */
+ FILE_STATUS_READ_WRITE_ERROR, /** not readable/writable */
+ FILE_STATUS_NOT_REGULAR_FILE_ERROR /** not a regular file */
+ };
+
+ /** Verify the size of the physical file
+ @param[in] file data file object
+ @return DB_SUCCESS if OK else error code. */
+ dberr_t check_size(Datafile& file);
+
+ /** Check if a file can be opened in the correct mode.
+ @param[in,out] file data file object
+ @param[out] reason exact reason if file_status check failed.
+ @return DB_SUCCESS or error code. */
+ dberr_t check_file_status(
+ const Datafile& file,
+ file_status_t& reason);
+
+ /* DATA MEMBERS */
+
+ /** if true, then we auto-extend the last data file */
+ bool m_auto_extend_last_file;
+
+ /** maximum size of the last data file (0=unlimited) */
+ ulint m_last_file_size_max;
+
+ /** If the following is true we do not allow
+ inserts etc. This protects the user from forgetting
+ the 'newraw' keyword to my.cnf */
+ bool m_created_new_raw;
+
+ /** Tablespace full status */
+ bool m_is_tablespace_full;
+
+ /** if false, then sanity checks are still pending */
+ bool m_sanity_checks_done;
+};
+
+/* GLOBAL OBJECTS */
+
+/** The control info of the system tablespace. */
+extern SysTablespace srv_sys_space;
+
+/** The control info of a temporary table shared tablespace. */
+extern SysTablespace srv_tmp_space;
+
+/** Check if the space_id is for a system-tablespace (shared + temp).
+@param[in] id Space ID to check
+@return true if id is a system tablespace, false if not. */
+inline bool is_system_tablespace(uint32_t id)
+{
+ return id == TRX_SYS_SPACE || id == SRV_TMP_SPACE_ID;
+}
+
+/** Check if predefined shared tablespace.
+@return true if predefined shared tablespace */
+inline bool is_predefined_tablespace(uint32_t id)
+{
+ return is_system_tablespace(id) || srv_is_undo_tablespace(id);
+}
+#endif /* fsp0sysspace_h */
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
new file mode 100644
index 00000000..9a23e840
--- /dev/null
+++ b/storage/innobase/include/fsp0types.h
@@ -0,0 +1,404 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************
+@file include/fsp0types.h
+File space management types
+
+Created May 26, 2009 Vasil Dimov
+*******************************************************/
+
+#pragma once
+#include "ut0byte.h"
+
+/** All persistent tablespaces have a smaller fil_space_t::id than this. */
+constexpr uint32_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0U;
+/** The fil_space_t::id of the innodb_temporary tablespace. */
+constexpr uint32_t SRV_TMP_SPACE_ID= 0xFFFFFFFEU;
+
+/* Possible values of innodb_compression_algorithm */
+#define PAGE_UNCOMPRESSED 0
+#define PAGE_ZLIB_ALGORITHM 1
+#define PAGE_LZ4_ALGORITHM 2
+#define PAGE_LZO_ALGORITHM 3
+#define PAGE_LZMA_ALGORITHM 4
+#define PAGE_BZIP2_ALGORITHM 5
+#define PAGE_SNAPPY_ALGORITHM 6
+#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM
+
+extern const char *page_compression_algorithms[];
+
+/** @name Flags for inserting records in order
+If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page_general) */
+/* @{ */
+#define FSP_UP ((byte)111) /*!< alphabetically upwards */
+#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */
+#define FSP_NO_DIR ((byte)113) /*!< no order */
+/* @} */
+
+/** File space extent size in pages
+page size | file space extent size
+----------+-----------------------
+ 4 KiB | 256 pages = 1 MiB
+ 8 KiB | 128 pages = 1 MiB
+ 16 KiB | 64 pages = 1 MiB
+ 32 KiB | 64 pages = 2 MiB
+ 64 KiB | 64 pages = 4 MiB
+*/
+#define FSP_EXTENT_SIZE (srv_page_size_shift < 14 ? \
+ (1048576U >> srv_page_size_shift) : 64U)
+
+/** File space extent size (four megabyte) in pages for MAX page size */
+#define FSP_EXTENT_SIZE_MAX (4194304 / UNIV_PAGE_SIZE_MAX)
+
+/** File space extent size (one megabyte) in pages for MIN page size */
+#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN)
+
+/** On a page of any file segment, data may be put starting from this
+offset */
+#define FSEG_PAGE_DATA FIL_PAGE_DATA
+
+/** @name File segment header
+The file segment header points to the inode describing the file segment. */
+/* @{ */
+/** Data type for file segment header */
+typedef byte fseg_header_t;
+
+#define FSEG_HDR_SPACE 0 /*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */
+#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE 10 /*!< Length of the file system
+ header, in bytes */
+/* @} */
+
+#ifndef UNIV_INNOCHECKSUM
+#ifdef UNIV_DEBUG
+
+struct mtr_t;
+
+/** A wrapper class to print the file segment header information. */
+class fseg_header
+{
+public:
+ /** Constructor of fseg_header.
+ @param[in] header the underlying file segment header object
+ @param[in] mtr the mini-transaction. No redo logs are
+ generated, only latches are checked within
+ mini-transaction */
+ fseg_header(
+ const fseg_header_t* header,
+ mtr_t* mtr)
+ :
+ m_header(header),
+ m_mtr(mtr)
+ {}
+
+ /** Print the file segment header to the given output stream.
+ @param[in,out] out the output stream into which the object
+ is printed.
+ @retval the output stream into which the object was printed. */
+ std::ostream&
+ to_stream(std::ostream& out) const;
+private:
+ /** The underlying file segment header */
+ const fseg_header_t* m_header;
+
+ /** The mini transaction, which is used mainly to check whether
+ appropriate latches have been taken by the calling thread. */
+ mtr_t* m_mtr;
+};
+
+/* Overloading the global output operator to print a file segment header
+@param[in,out] out the output stream into which object will be printed
+@param[in] header the file segment header to be printed
+@retval the output stream */
+inline
+std::ostream&
+operator<<(
+ std::ostream& out,
+ const fseg_header& header)
+{
+ return(header.to_stream(out));
+}
+#endif /* UNIV_DEBUG */
+
+/** Flags for fsp_reserve_free_extents */
+enum fsp_reserve_t {
+ FSP_NORMAL, /* reservation during normal B-tree operations */
+ FSP_UNDO, /* reservation done for undo logging */
+ FSP_CLEANING, /* reservation done during purge operations */
+ FSP_BLOB /* reservation being done for BLOB insertion */
+};
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE srv_page_size */
+/* This has been replaced with either srv_page_size or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET 0U /* !< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET 1U /* !< insert buffer bitmap */
+ /* The ibuf bitmap pages are the ones whose
+ page number is the number above plus a
+ multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */
+ /* The following pages exist
+ in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer
+ header page, in
+ tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer
+ B-tree root page in
+ tablespace 0 */
+ /* The ibuf tree root page number in
+ tablespace 0; its fseg inode is on the page
+ number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction
+ system header, in
+ tablespace 0 */
+#define FSP_FIRST_RSEG_PAGE_NO 6U /*!< first rollback segment
+ page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO 7U /*!< data dictionary header
+ page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+/** Check if tablespace is system temporary.
+@param[in] space_id verify is checksum is enabled for given space.
+@return true if tablespace is system temporary. */
+inline
+bool
+fsp_is_system_temporary(ulint space_id)
+{
+ return(space_id == SRV_TMP_SPACE_ID);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */
+
+/** Width of the POST_ANTELOPE flag */
+#define FSP_FLAGS_WIDTH_POST_ANTELOPE 1
+/** Number of flag bits used to indicate the tablespace zip page size */
+#define FSP_FLAGS_WIDTH_ZIP_SSIZE 4
+/** Width of the ATOMIC_BLOBS flag. The ability to break up a long
+column into an in-record prefix and an externally stored part is available
+to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */
+#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_WIDTH_PAGE_SSIZE 4
+/** Number of reserved bits */
+#define FSP_FLAGS_WIDTH_RESERVED 6
+/** Number of flag bits used to indicate the page compression */
+#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1
+
+/** Width of all the currently known persistent tablespace flags */
+#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \
+ + FSP_FLAGS_WIDTH_ZIP_SSIZE \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_PAGE_SSIZE \
+ + FSP_FLAGS_WIDTH_RESERVED \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION)
+
+/** A mask of all the known/used bits in FSP_SPACE_FLAGS */
+#define FSP_FLAGS_MASK (~(~0U << FSP_FLAGS_WIDTH))
+
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE 4
+
+/** Marker to indicate whether tablespace is in full checksum format. */
+#define FSP_FLAGS_FCRC32_WIDTH_MARKER 1
+
+/** Stores the compressed algo for full checksum format. */
+#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO 3
+
+/* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older
+and MariaDB 10.1.20 or older MariaDB 10.1 and in MariaDB 10.1.21
+or newer.
+MySQL 5.6 MariaDB 10.1.x MariaDB 10.1.21
+====================================================================
+Below flags in same offset
+====================================================================
+0: POST_ANTELOPE 0:POST_ANTELOPE 0: POST_ANTELOPE
+1..4: ZIP_SSIZE(0..5) 1..4:ZIP_SSIZE(0..5) 1..4: ZIP_SSIZE(0..5)
+(NOTE: bit 4 is always 0)
+5: ATOMIC_BLOBS 5:ATOMIC_BLOBS 5: ATOMIC_BLOBS
+=====================================================================
+Below note the order difference:
+=====================================================================
+6..9: PAGE_SSIZE(3..7) 6: COMPRESSION 6..9: PAGE_SSIZE(3..7)
+10: DATA_DIR 7..10: COMP_LEVEL(0..9) 10: RESERVED (5.6 DATA_DIR)
+=====================================================================
+The flags below were in incorrect position in MariaDB 10.1,
+or have been introduced in MySQL 5.7 or 8.0:
+=====================================================================
+11: UNUSED 11..12:ATOMIC_WRITES 11: RESERVED (5.7 SHARED)
+ 12: RESERVED (5.7 TEMPORARY)
+ 13..15:PAGE_SSIZE(3..7) 13: RESERVED (5.7 ENCRYPTION)
+ 14: RESERVED (8.0 SDI)
+ 15: RESERVED
+ 16: PAGE_SSIZE_msb(0) 16: COMPRESSION
+ 17: DATA_DIR 17: UNUSED
+ 18: UNUSED
+=====================================================================
+The flags below only exist in fil_space_t::flags, not in FSP_SPACE_FLAGS:
+=====================================================================
+ 27: DATA_DIR
+ 28..31: COMPRESSION_LEVEL
+*/
+
+/** A mask of the memory-only flags in fil_space_t::flags */
+#define FSP_FLAGS_MEM_MASK (~0U << FSP_FLAGS_MEM_DATA_DIR)
+
+/** Zero relative shift position of the DATA_DIR flag */
+#define FSP_FLAGS_MEM_DATA_DIR 27
+/** Zero relative shift position of the COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MEM_COMPRESSION_LEVEL 28
+
+/** Zero relative shift position of the POST_ANTELOPE field */
+#define FSP_FLAGS_POS_POST_ANTELOPE 0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define FSP_FLAGS_POS_ZIP_SSIZE (FSP_FLAGS_POS_POST_ANTELOPE \
+ + FSP_FLAGS_WIDTH_POST_ANTELOPE)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \
+ + FSP_FLAGS_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the start of the PAGE_SSIZE bits */
+#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the start of the RESERVED bits
+these are only used in MySQL 5.7 and used for compatibility. */
+#define FSP_FLAGS_POS_RESERVED (FSP_FLAGS_POS_PAGE_SSIZE \
+ + FSP_FLAGS_WIDTH_PAGE_SSIZE)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_RESERVED \
+ + FSP_FLAGS_WIDTH_RESERVED)
+
+/** Zero relative shift position of the PAGE_SIZE field
+in full crc32 format */
+#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE 0
+
+/** Zero relative shift position of the MARKER field in full crc32 format. */
+#define FSP_FLAGS_FCRC32_POS_MARKER (FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \
+ + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)
+
+/** Zero relative shift position of the compressed algorithm stored
+in full crc32 format. */
+#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO (FSP_FLAGS_FCRC32_POS_MARKER \
+ + FSP_FLAGS_FCRC32_WIDTH_MARKER)
+
+/** Bit mask of the POST_ANTELOPE field */
+#define FSP_FLAGS_MASK_POST_ANTELOPE \
+ ((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \
+ << FSP_FLAGS_POS_POST_ANTELOPE)
+/** Bit mask of the ZIP_SSIZE field */
+#define FSP_FLAGS_MASK_ZIP_SSIZE \
+ ((~(~0U << FSP_FLAGS_WIDTH_ZIP_SSIZE)) \
+ << FSP_FLAGS_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_MASK_ATOMIC_BLOBS \
+ ((~(~0U << FSP_FLAGS_WIDTH_ATOMIC_BLOBS)) \
+ << FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE \
+ ((~(~0U << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \
+ << FSP_FLAGS_POS_PAGE_SSIZE)
+/** Bit mask of the RESERVED1 field */
+#define FSP_FLAGS_MASK_RESERVED \
+ ((~(~0U << FSP_FLAGS_WIDTH_RESERVED)) \
+ << FSP_FLAGS_POS_RESERVED)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION \
+ ((~(~0U << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \
+ << FSP_FLAGS_POS_PAGE_COMPRESSION)
+
+/** Bit mask of the in-memory COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL \
+ (15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL)
+
+/** Bit mask of the PAGE_SIZE field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE \
+ ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)) \
+ << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+
+/** Bit mask of the MARKER field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_MARKER \
+ ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER)) \
+ << FSP_FLAGS_FCRC32_POS_MARKER)
+
+/** Bit mask of the COMPRESSED ALGO field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO \
+ ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO)) \
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO)
+
+/** Return the value of the POST_ANTELOPE field */
+#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \
+ ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \
+ >> FSP_FLAGS_POS_POST_ANTELOPE)
+/** Return the value of the ZIP_SSIZE field */
+#define FSP_FLAGS_GET_ZIP_SSIZE(flags) \
+ ((flags & FSP_FLAGS_MASK_ZIP_SSIZE) \
+ >> FSP_FLAGS_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags) \
+ ((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS) \
+ >> FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \
+ >> FSP_FLAGS_POS_PAGE_SSIZE)
+/** @return the RESERVED flags */
+#define FSP_FLAGS_GET_RESERVED(flags) \
+ ((flags & FSP_FLAGS_MASK_RESERVED) \
+ >> FSP_FLAGS_POS_RESERVED)
+/** @return the PAGE_COMPRESSION flag */
+#define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION)
+/** @return the PAGE_SSIZE flags in full crc32 format */
+#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags) \
+ ((flags & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE) \
+ >> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+/** @return the COMPRESSED_ALGO flags in full crc32 format */
+#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) \
+ ((flags & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO) \
+ >> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO)
+
+/** @return the value of the DATA_DIR field */
+#define FSP_FLAGS_HAS_DATA_DIR(flags) \
+ (flags & 1U << FSP_FLAGS_MEM_DATA_DIR)
+/** @return the COMPRESSION_LEVEL field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \
+ ((flags & FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL) \
+ >> FSP_FLAGS_MEM_COMPRESSION_LEVEL)
+
+/* @} */
+
+struct fil_node_t;
+struct fil_space_t;
+class buf_page_t;
diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h
new file mode 100644
index 00000000..15bf30bc
--- /dev/null
+++ b/storage/innobase/include/fts0ast.h
@@ -0,0 +1,340 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0ast.h
+The FTS query parser (AST) abstract syntax tree routines
+
+Created 2007/03/16/03 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FST0AST_H
+#define INNOBASE_FST0AST_H
+
+#include "mem0mem.h"
+
+/* The type of AST Node */
+enum fts_ast_type_t {
+ FTS_AST_OPER, /*!< Operator */
+ FTS_AST_NUMB, /*!< Number */
+ FTS_AST_TERM, /*!< Term (or word) */
+ FTS_AST_TEXT, /*!< Text string */
+ FTS_AST_PARSER_PHRASE_LIST, /*!< Phase for plugin parser
+ The difference from text type
+ is that we tokenize text into
+ term list */
+ FTS_AST_LIST, /*!< Expression list */
+ FTS_AST_SUBEXP_LIST /*!< Sub-Expression list */
+};
+
+/* The FTS query operators that we support */
+enum fts_ast_oper_t {
+ FTS_NONE, /*!< No operator */
+
+ FTS_IGNORE, /*!< Ignore rows that contain
+ this word */
+
+ FTS_EXIST, /*!< Include rows that contain
+ this word */
+
+ FTS_NEGATE, /*!< Include rows that contain
+ this word but rank them
+ lower*/
+
+ FTS_INCR_RATING, /*!< Increase the rank for this
+ word*/
+
+ FTS_DECR_RATING, /*!< Decrease the rank for this
+ word*/
+
+ FTS_DISTANCE, /*!< Proximity distance */
+ FTS_IGNORE_SKIP, /*!< Transient node operator
+ signifies that this is a
+ FTS_IGNORE node, and ignored in
+ the first pass of
+ fts_ast_visit() */
+ FTS_EXIST_SKIP /*!< Transient node operator
+ signifies that this ia a
+ FTS_EXIST node, and ignored in
+ the first pass of
+ fts_ast_visit() */
+};
+
+/* Data types used by the FTS parser */
+struct fts_lexer_t;
+struct fts_ast_node_t;
+struct fts_ast_state_t;
+struct fts_ast_string_t;
+
+typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*);
+
+/********************************************************************
+Parse the string using the lexer setup within state.*/
+int
+fts_parse(
+/*======*/
+ /* out: 0 on OK, 1 on error */
+ fts_ast_state_t* state); /*!< in: ast state instance.*/
+
+/********************************************************************
+Create an AST operator node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ fts_ast_oper_t oper); /*!< in: ast operator */
+/********************************************************************
+Create an AST term node, makes a copy of ptr */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ const fts_ast_string_t* ptr); /*!< in: term string */
+/********************************************************************
+Create an AST text node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ const fts_ast_string_t* ptr); /*!< in: text string */
+/********************************************************************
+Create an AST expr list node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ fts_ast_node_t* expr); /*!< in: ast expr */
+/********************************************************************
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it. */
+extern
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+ /* out: new node */
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr); /*!< in: ast expr instance */
+/********************************************************************
+Set the wildcard attribute of a term.*/
+extern
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+ fts_ast_node_t* node); /*!< in: term to change */
+/********************************************************************
+Set the proximity attribute of a text node. */
+void
+fts_ast_text_set_distance(
+/*======================*/
+ fts_ast_node_t* node, /*!< in/out: text node */
+ ulint distance); /*!< in: the text proximity
+ distance */
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+ fts_ast_node_t* node); /*!< in: node to free */
+/********************************************************************
+Add a sub-expression to an AST*/
+extern
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+ fts_ast_node_t* list, /*!< in: list node instance */
+ fts_ast_node_t* node); /*!< in: (sub) expr to add */
+/********************************************************************
+Print the AST node recursively.*/
+extern
+void
+fts_ast_node_print(
+/*===============*/
+ fts_ast_node_t* node); /*!< in: ast node to print */
+/********************************************************************
+Free node and expr allocations.*/
+extern
+void
+fts_ast_state_free(
+/*===============*/
+ fts_ast_state_t*state); /*!< in: state instance
+ to free */
+/** Check only union operation involved in the node
+@param[in] node ast node to check
+@return true if the node contains only union else false. */
+bool
+fts_ast_node_check_union(
+ fts_ast_node_t* node);
+
+/******************************************************************//**
+Traverse the AST - in-order traversal.
+@return DB_SUCCESS if all went well */
+dberr_t
+fts_ast_visit(
+/*==========*/
+ fts_ast_oper_t oper, /*!< in: FTS operator */
+ fts_ast_node_t* node, /*!< in: instance to traverse*/
+ fts_ast_callback visitor, /*!< in: callback */
+ void* arg, /*!< in: callback arg */
+ bool* has_ignore) /*!< out: whether we encounter
+ and ignored processing an
+ operator, currently we only
+ ignore FTS_IGNORE operator */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************
+Create a lex instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+ ibool boolean_mode, /*!< in: query type */
+ const byte* query, /*!< in: query string */
+ ulint query_len) /*!< in: query string len */
+ MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+ fts_lexer_t* fts_lexer) /*!< in: lexer instance to
+ free */
+ MY_ATTRIBUTE((nonnull));
+
+/**
+Create an ast string object, with NUL-terminator, so the string
+has one more byte than len
+@param[in] str pointer to string
+@param[in] len length of the string
+@return ast string with NUL-terminator */
+fts_ast_string_t*
+fts_ast_string_create(
+ const byte* str,
+ ulint len);
+
+/**
+Free an ast string instance
+@param[in,out] ast_str string to free */
+void
+fts_ast_string_free(
+ fts_ast_string_t* ast_str);
+
+/**
+Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul
+@param[in] str string to translate
+@param[in] base the base
+@return translated number */
+ulint
+fts_ast_string_to_ul(
+ const fts_ast_string_t* ast_str,
+ int base);
+
+/* String of length len.
+We always store the string of length len with a terminating '\0',
+regardless of there is any 0x00 in the string itself */
+struct fts_ast_string_t {
+ /*!< Pointer to string. */
+ byte* str;
+
+ /*!< Length of the string. */
+ ulint len;
+};
+
+/* Query term type */
+struct fts_ast_term_t {
+ fts_ast_string_t* ptr; /*!< Pointer to term string.*/
+ ibool wildcard; /*!< TRUE if wild card set.*/
+};
+
+/* Query text type */
+struct fts_ast_text_t {
+ fts_ast_string_t* ptr; /*!< Pointer to text string.*/
+ ulint distance; /*!< > 0 if proximity distance
+ set */
+};
+
+/* The list of nodes in an expr list */
+struct fts_ast_list_t {
+ fts_ast_node_t* head; /*!< Children list head */
+ fts_ast_node_t* tail; /*!< Children list tail */
+};
+
+/* FTS AST node to store the term, text, operator and sub-expressions.*/
+struct fts_ast_node_t {
+ fts_ast_type_t type; /*!< The type of node */
+ fts_ast_text_t text; /*!< Text node */
+ fts_ast_term_t term; /*!< Term node */
+ fts_ast_oper_t oper; /*!< Operator value */
+ fts_ast_list_t list; /*!< Expression list */
+ fts_ast_node_t* next; /*!< Link for expr list */
+ fts_ast_node_t* next_alloc; /*!< For tracking allocations */
+ bool visited; /*!< whether this node is
+ already processed */
+ /** current transaction */
+ const trx_t* trx;
+ /* Used by plugin parser */
+ fts_ast_node_t* up_node; /*!< Direct up node */
+ bool go_up; /*!< Flag if go one level up */
+};
+
+/* To track state during parsing */
+struct fts_ast_state_t {
+ mem_heap_t* heap; /*!< Heap to use for alloc */
+ fts_ast_node_t* root; /*!< If all goes OK, then this
+ will point to the root.*/
+
+ fts_ast_list_t list; /*!< List of nodes allocated */
+
+ fts_lexer_t* lexer; /*!< Lexer callback + arg */
+ CHARSET_INFO* charset; /*!< charset used for
+ tokenization */
+ /* Used by plugin parser */
+ fts_ast_node_t* cur_node; /*!< Current node into which
+ we add new node */
+ int depth; /*!< Depth of parsing state */
+};
+
+/******************************************************************//**
+Create an AST term node, makes a copy of ptr for plugin parser
+@return node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term_for_parser(
+/*==========i=====================*/
+ void* arg, /*!< in: ast state */
+ const char* ptr, /*!< in: term string */
+ const ulint len); /*!< in: term string length */
+
+/******************************************************************//**
+Create an AST phrase list node for plugin parser
+@return node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_phrase_list(
+/*============================*/
+ void* arg); /*!< in: ast state */
+
+#ifdef UNIV_DEBUG
+const char*
+fts_ast_node_type_get(fts_ast_type_t type);
+#endif /* UNIV_DEBUG */
+
+#endif /* INNOBASE_FSTS0AST_H */
diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h
new file mode 100644
index 00000000..b16e7f2c
--- /dev/null
+++ b/storage/innobase/include/fts0blex.h
@@ -0,0 +1,702 @@
+#ifndef fts0bHEADER_H
+#define fts0bHEADER_H 1
+#define fts0bIN_HEADER 1
+
+#line 6 "../include/fts0blex.h"
+
+#line 8 "../include/fts0blex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0b_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0b_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0b_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0b_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0b_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0b_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0b_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0b_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0b_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0b_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0b_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0b_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0b_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0b_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0b_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0b_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0b_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0b_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0bpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0bpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0bpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0bpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0bensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0bensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0blex_ALREADY_DEFINED
+#else
+#define yylex fts0blex
+#endif
+
+#ifdef yyrestart
+#define fts0brestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0brestart
+#endif
+
+#ifdef yylex_init
+#define fts0blex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0blex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0blex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0blex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0blex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0blex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0bget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0bget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0bset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0bset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0bget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0bget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0bset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0bset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0bget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0bget_in
+#endif
+
+#ifdef yyset_in
+#define fts0bset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0bset_in
+#endif
+
+#ifdef yyget_out
+#define fts0bget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0bget_out
+#endif
+
+#ifdef yyset_out
+#define fts0bset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0bset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0bget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0bget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0bget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0bget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0bget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0bget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0bset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0bset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0bget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0bget_column
+#endif
+
+#ifdef yyset_column
+#define fts0bset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0bset_column
+#endif
+
+#ifdef yywrap
+#define fts0bwrap_ALREADY_DEFINED
+#else
+#define yywrap fts0bwrap
+#endif
+
+#ifdef yyalloc
+#define fts0balloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0balloc
+#endif
+
+#ifdef yyrealloc
+#define fts0brealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0brealloc
+#endif
+
+#ifdef yyfree
+#define fts0bfree_ALREADY_DEFINED
+#else
+#define yyfree fts0bfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ int yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0bwrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+ int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#ifndef fts0b_create_buffer_ALREADY_DEFINED
+#undef yy_create_buffer
+#endif
+#ifndef fts0b_delete_buffer_ALREADY_DEFINED
+#undef yy_delete_buffer
+#endif
+#ifndef fts0b_scan_buffer_ALREADY_DEFINED
+#undef yy_scan_buffer
+#endif
+#ifndef fts0b_scan_string_ALREADY_DEFINED
+#undef yy_scan_string
+#endif
+#ifndef fts0b_scan_bytes_ALREADY_DEFINED
+#undef yy_scan_bytes
+#endif
+#ifndef fts0b_init_buffer_ALREADY_DEFINED
+#undef yy_init_buffer
+#endif
+#ifndef fts0b_flush_buffer_ALREADY_DEFINED
+#undef yy_flush_buffer
+#endif
+#ifndef fts0b_load_buffer_state_ALREADY_DEFINED
+#undef yy_load_buffer_state
+#endif
+#ifndef fts0b_switch_to_buffer_ALREADY_DEFINED
+#undef yy_switch_to_buffer
+#endif
+#ifndef fts0bpush_buffer_state_ALREADY_DEFINED
+#undef yypush_buffer_state
+#endif
+#ifndef fts0bpop_buffer_state_ALREADY_DEFINED
+#undef yypop_buffer_state
+#endif
+#ifndef fts0bensure_buffer_stack_ALREADY_DEFINED
+#undef yyensure_buffer_stack
+#endif
+#ifndef fts0blex_ALREADY_DEFINED
+#undef yylex
+#endif
+#ifndef fts0brestart_ALREADY_DEFINED
+#undef yyrestart
+#endif
+#ifndef fts0blex_init_ALREADY_DEFINED
+#undef yylex_init
+#endif
+#ifndef fts0blex_init_extra_ALREADY_DEFINED
+#undef yylex_init_extra
+#endif
+#ifndef fts0blex_destroy_ALREADY_DEFINED
+#undef yylex_destroy
+#endif
+#ifndef fts0bget_debug_ALREADY_DEFINED
+#undef yyget_debug
+#endif
+#ifndef fts0bset_debug_ALREADY_DEFINED
+#undef yyset_debug
+#endif
+#ifndef fts0bget_extra_ALREADY_DEFINED
+#undef yyget_extra
+#endif
+#ifndef fts0bset_extra_ALREADY_DEFINED
+#undef yyset_extra
+#endif
+#ifndef fts0bget_in_ALREADY_DEFINED
+#undef yyget_in
+#endif
+#ifndef fts0bset_in_ALREADY_DEFINED
+#undef yyset_in
+#endif
+#ifndef fts0bget_out_ALREADY_DEFINED
+#undef yyget_out
+#endif
+#ifndef fts0bset_out_ALREADY_DEFINED
+#undef yyset_out
+#endif
+#ifndef fts0bget_leng_ALREADY_DEFINED
+#undef yyget_leng
+#endif
+#ifndef fts0bget_text_ALREADY_DEFINED
+#undef yyget_text
+#endif
+#ifndef fts0bget_lineno_ALREADY_DEFINED
+#undef yyget_lineno
+#endif
+#ifndef fts0bset_lineno_ALREADY_DEFINED
+#undef yyset_lineno
+#endif
+#ifndef fts0bget_column_ALREADY_DEFINED
+#undef yyget_column
+#endif
+#ifndef fts0bset_column_ALREADY_DEFINED
+#undef yyset_column
+#endif
+#ifndef fts0bwrap_ALREADY_DEFINED
+#undef yywrap
+#endif
+#ifndef fts0bget_lval_ALREADY_DEFINED
+#undef yyget_lval
+#endif
+#ifndef fts0bset_lval_ALREADY_DEFINED
+#undef yyset_lval
+#endif
+#ifndef fts0bget_lloc_ALREADY_DEFINED
+#undef yyget_lloc
+#endif
+#ifndef fts0bset_lloc_ALREADY_DEFINED
+#undef yyset_lloc
+#endif
+#ifndef fts0balloc_ALREADY_DEFINED
+#undef yyalloc
+#endif
+#ifndef fts0brealloc_ALREADY_DEFINED
+#undef yyrealloc
+#endif
+#ifndef fts0bfree_ALREADY_DEFINED
+#undef yyfree
+#endif
+#ifndef fts0btext_ALREADY_DEFINED
+#undef yytext
+#endif
+#ifndef fts0bleng_ALREADY_DEFINED
+#undef yyleng
+#endif
+#ifndef fts0bin_ALREADY_DEFINED
+#undef yyin
+#endif
+#ifndef fts0bout_ALREADY_DEFINED
+#undef yyout
+#endif
+#ifndef fts0b_flex_debug_ALREADY_DEFINED
+#undef yy_flex_debug
+#endif
+#ifndef fts0blineno_ALREADY_DEFINED
+#undef yylineno
+#endif
+#ifndef fts0btables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0btables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0bTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 74 "fts0blex.l"
+
+
+#line 701 "../include/fts0blex.h"
+#undef fts0bIN_HEADER
+#endif /* fts0bHEADER_H */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
new file mode 100644
index 00000000..c0151b44
--- /dev/null
+++ b/storage/innobase/include/fts0fts.h
@@ -0,0 +1,947 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0fts.h
+Full text search header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#pragma once
+
+#include "data0type.h"
+#include "data0types.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "ut0rbt.h"
+#include "ut0wqueue.h"
+#include "que0types.h"
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+
+/** "NULL" value of a document id. */
+#define FTS_NULL_DOC_ID 0
+
+/** FTS hidden column that is used to map to and from the row */
+#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID"
+
+/** The name of the index created by FTS */
+#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX"
+
+#define FTS_DOC_ID_INDEX_NAME_LEN 16
+
+/** Doc ID is a 8 byte value */
+#define FTS_DOC_ID_LEN 8
+
+/** The number of fields to sort when we build FT index with
+FIC. Three fields are sort: (word, doc_id, position) */
+#define FTS_NUM_FIELDS_SORT 3
+
+/** Maximum number of rows in a table, smaller than which, we will
+optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */
+#define MAX_DOC_ID_OPT_VAL 1073741824
+
+/** Document id type. */
+typedef ib_id_t doc_id_t;
+
+/** doc_id_t printf format */
+#define FTS_DOC_ID_FORMAT IB_ID_FMT
+
+/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */
+#define fts_write_doc_id(d, s) mach_write_to_8(d, s)
+
+/** Read a document id to internal format. */
+#define fts_read_doc_id(s) mach_read_from_8(s)
+
+/** Bind the doc id to a variable */
+#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v)
+
+/** Defines for FTS query mode, they have the same values as
+those defined in mysql file ft_global.h */
+#define FTS_NL 0
+#define FTS_BOOL 1
+#define FTS_SORTED 2
+#define FTS_EXPAND 4
+#define FTS_NO_RANKING 8
+#define FTS_PROXIMITY 16
+#define FTS_PHRASE 32
+#define FTS_OPT_RANKING 64
+
+#define FTS_INDEX_TABLE_IND_NAME "FTS_INDEX_TABLE_IND"
+
+/** The number of FTS index partitions for a fulltext idnex */
+#define FTS_NUM_AUX_INDEX 6
+
+/** Threshold where our optimize thread automatically kicks in */
+#define FTS_OPTIMIZE_THRESHOLD 10000000
+
+/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */
+#define FTS_MAX_WORD_LEN (HA_FT_MAXCHARLEN * 4)
+
+/** Maximum possible Fulltext word length (in characters) */
+#define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN
+
+/** Number of columns in FTS AUX Tables */
+#define FTS_DELETED_TABLE_NUM_COLS 1
+#define FTS_CONFIG_TABLE_NUM_COLS 2
+#define FTS_AUX_INDEX_TABLE_NUM_COLS 5
+
+/** DELETED_TABLE(doc_id BIGINT UNSIGNED) */
+#define FTS_DELETED_TABLE_COL_LEN 8
+/** CONFIG_TABLE(key CHAR(50), value CHAR(200)) */
+#define FTS_CONFIG_TABLE_KEY_COL_LEN 50
+#define FTS_CONFIG_TABLE_VALUE_COL_LEN 200
+
+#define FTS_INDEX_FIRST_DOC_ID_LEN 8
+#define FTS_INDEX_LAST_DOC_ID_LEN 8
+#define FTS_INDEX_DOC_COUNT_LEN 4
+/* BLOB COLUMN, 0 means VARIABLE SIZE */
+#define FTS_INDEX_ILIST_LEN 0
+
+
+/** Variable specifying the FTS parallel sort degree */
+extern ulong fts_sort_pll_degree;
+
+/** Variable specifying the number of word to optimize for each optimize table
+call */
+extern ulong fts_num_word_optimize;
+
+/** Variable specifying whether we do additional FTS diagnostic printout
+in the log */
+extern char fts_enable_diag_print;
+
+/** FTS rank type, which will be between 0 .. 1 inclusive */
+typedef float fts_rank_t;
+
+/** Type of a row during a transaction. FTS_NOTHING means the row can be
+forgotten from the FTS system's POV, FTS_INVALID is an internal value used
+to mark invalid states.
+
+NOTE: Do not change the order or value of these, fts_trx_row_get_new_state
+depends on them being exactly as they are. */
+enum fts_row_state {
+ FTS_INSERT = 0,
+ FTS_MODIFY,
+ FTS_DELETE,
+ FTS_NOTHING,
+ FTS_INVALID
+};
+
+/** The FTS table types. */
+enum fts_table_type_t {
+ FTS_INDEX_TABLE, /*!< FTS auxiliary table that is
+ specific to a particular FTS index
+ on a table */
+
+ FTS_COMMON_TABLE /*!< FTS auxiliary table that is common
+ for all FTS index on a table */
+};
+
+struct fts_doc_t;
+struct fts_cache_t;
+struct fts_token_t;
+struct fts_doc_ids_t;
+struct fts_index_cache_t;
+
+
+/** Initialize the "fts_table" for internal query into FTS auxiliary
+tables */
+#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\
+do { \
+ (fts_table)->suffix = m_suffix; \
+ (fts_table)->type = m_type; \
+ (fts_table)->table_id = m_table->id; \
+ (fts_table)->table = m_table; \
+} while (0);
+
+#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\
+do { \
+ (fts_table)->suffix = m_suffix; \
+ (fts_table)->type = m_type; \
+ (fts_table)->table_id = m_index->table->id; \
+ (fts_table)->table = m_index->table; \
+ (fts_table)->index_id = m_index->id; \
+} while (0);
+
+/** Information about changes in a single transaction affecting
+the FTS system. */
+struct fts_trx_t {
+ trx_t* trx; /*!< InnoDB transaction */
+
+ ib_vector_t* savepoints; /*!< Active savepoints, must have at
+ least one element, the implied
+ savepoint */
+ ib_vector_t* last_stmt; /*!< last_stmt */
+
+ mem_heap_t* heap; /*!< heap */
+};
+
+/** Information required for transaction savepoint handling. */
+struct fts_savepoint_t {
+ char* name; /*!< First entry is always NULL, the
+ default instance. Otherwise the name
+ of the savepoint */
+
+ ib_rbt_t* tables; /*!< Modified FTS tables */
+};
+
+/** Information about changed rows in a transaction for a single table. */
+struct fts_trx_table_t {
+ dict_table_t* table; /*!< table */
+
+ fts_trx_t* fts_trx; /*!< link to parent */
+
+ ib_rbt_t* rows; /*!< rows changed; indexed by doc-id,
+ cells are fts_trx_row_t* */
+
+ fts_doc_ids_t* added_doc_ids; /*!< list of added doc ids (NULL until
+ the first addition) */
+
+ /*!< for adding doc ids */
+ que_t* docs_added_graph;
+};
+
+/** Information about one changed row in a transaction. */
+struct fts_trx_row_t {
+ doc_id_t doc_id; /*!< Id of the ins/upd/del document */
+
+ fts_row_state state; /*!< state of the row */
+
+ ib_vector_t* fts_indexes; /*!< The indexes that are affected */
+};
+
+/** List of document ids that were added during a transaction. This
+list is passed on to a background 'Add' thread and OPTIMIZE, so it
+needs its own memory heap. */
+struct fts_doc_ids_t {
+ ib_vector_t* doc_ids; /*!< document ids (each element is
+ of type doc_id_t). */
+
+ ib_alloc_t* self_heap; /*!< Allocator used to create an
+ instance of this type and the
+ doc_ids vector */
+};
+
+// FIXME: Get rid of this if possible.
+/** Since MySQL's character set support for Unicode is woefully inadequate
+(it supports basic operations like isalpha etc. only for 8-bit characters),
+we have to implement our own. We use UTF-16 without surrogate processing
+as our in-memory format. This typedef is a single such character. */
+typedef unsigned short ib_uc_t;
+
+/** An UTF-16 ro UTF-8 string. */
+struct fts_string_t {
+ byte* f_str; /*!< string, not necessary terminated in
+ any way */
+ ulint f_len; /*!< Length of the string in bytes */
+ ulint f_n_char; /*!< Number of characters */
+};
+
+/** Query ranked doc ids. */
+struct fts_ranking_t {
+ doc_id_t doc_id; /*!< Document id */
+
+ fts_rank_t rank; /*!< Rank is between 0 .. 1 */
+
+ byte* words; /*!< this contains the words
+ that were queried
+ and found in this document */
+ ulint words_len; /*!< words len */
+};
+
+/** Query result. */
+struct fts_result_t {
+ ib_rbt_node_t* current; /*!< Current element */
+
+ ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t
+ indexed by doc id */
+ ib_rbt_t* rankings_by_rank;/*!< RB tree of type fts_ranking_t
+ indexed by rank */
+};
+
+/** This is used to generate the FTS auxiliary table name, we need the
+table id and the index id to generate the column specific FTS auxiliary
+table name. */
+struct fts_table_t {
+ fts_table_type_t
+ type; /*!< The auxiliary table type */
+
+ table_id_t table_id; /*!< The table id */
+
+ index_id_t index_id; /*!< The index id */
+
+ const char* suffix; /*!< The suffix of the fts auxiliary
+ table name, can be NULL, not used
+ everywhere (yet) */
+ const dict_table_t*
+ table; /*!< Parent table */
+ CHARSET_INFO* charset; /*!< charset info if it is for FTS
+ index auxiliary table */
+};
+
+/** The state of the FTS sub system. */
+class fts_t {
+public:
+ /** fts_t constructor.
+ @param[in] table table with FTS indexes
+ @param[in,out] heap memory heap where 'this' is stored */
+ fts_t(
+ const dict_table_t* table,
+ mem_heap_t* heap);
+
+ /** fts_t destructor. */
+ ~fts_t();
+
+ /** Whether the ADDED table record sync-ed after crash recovery */
+ unsigned added_synced:1;
+ /** Whether the table holds dict_sys.latch */
+ unsigned dict_locked:1;
+
+ /** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL
+ if the thread has not yet been created. Each work item is a
+ fts_trx_doc_ids_t*. */
+ ib_wqueue_t* add_wq;
+
+ /** FTS memory buffer for this table, or NULL if the table has no FTS
+ index. */
+ fts_cache_t* cache;
+
+ /** FTS doc id hidden column number in the CLUSTERED index. */
+ ulint doc_col;
+
+ /** Vector of FTS indexes, this is mainly for caching purposes. */
+ ib_vector_t* indexes;
+
+ /** Whether the table exists in fts_optimize_wq;
+ protected by fts_optimize_wq mutex */
+ bool in_queue;
+
+ /** Whether the sync message exists in fts_optimize_wq;
+ protected by fts_optimize_wq mutex */
+ bool sync_message;
+
+ /** Heap for fts_t allocation. */
+ mem_heap_t* fts_heap;
+};
+
+struct fts_stopword_t;
+
+/** status bits for fts_stopword_t status field. */
+#define STOPWORD_NOT_INIT 0x1
+#define STOPWORD_OFF 0x2
+#define STOPWORD_FROM_DEFAULT 0x4
+#define STOPWORD_USER_TABLE 0x8
+
+extern const char* fts_default_stopword[];
+
+/** Variable specifying the maximum FTS cache size for each table */
+extern Atomic_relaxed<size_t> fts_max_cache_size;
+
+/** Variable specifying the total memory allocated for FTS cache */
+extern Atomic_relaxed<size_t> fts_max_total_cache_size;
+
+/** Variable specifying the FTS result cache limit for each query */
+extern size_t fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS max token size */
+extern ulong fts_max_token_size;
+
+/** Variable specifying the minimum FTS max token size */
+extern ulong fts_min_token_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+extern bool fts_need_sync;
+
+/******************************************************************//**
+Create a FTS cache. */
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+ dict_table_t* table); /*!< table owns the FTS cache */
+
+/******************************************************************//**
+Create a FTS index cache.
+@return Index Cache */
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+ dict_table_t* table, /*!< in: table with FTS index */
+ dict_index_t* index); /*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id);/*!< out: new document id */
+
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/** Free fts_doc_ids_t */
+inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids)
+{
+ mem_heap_free(static_cast<mem_heap_t*>(doc_ids->self_heap->arg));
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+void
+fts_trx_add_op(
+/*===========*/
+ trx_t* trx, /*!< in: InnoDB transaction */
+ dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes); /*!< in: FTS indexes affected
+ (NULL=all) */
+
+/******************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+ fts_trx_t* fts_trx); /*!< in, own: FTS trx */
+
+/** Creates the common auxiliary tables needed for supporting an FTS index
+on the given table.
+The following tables are created.
+CREATE TABLE $FTS_PREFIX_DELETED
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_DELETED_CACHE
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_CONFIG
+ (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key)
+@param[in,out] trx transaction
+@param[in] table table with FTS index
+@param[in] skip_doc_id_index Skip index on doc id
+@return DB_SUCCESS if succeed */
+dberr_t
+fts_create_common_tables(
+ trx_t* trx,
+ dict_table_t* table,
+ bool skip_doc_id_index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table.
+
+All FTS AUX Index tables have the following schema.
+CREAT TABLE $FTS_PREFIX_INDEX_[1-6](
+ word VARCHAR(FTS_MAX_WORD_LEN),
+ first_doc_id INT NOT NULL,
+ last_doc_id UNSIGNED NOT NULL,
+ doc_count UNSIGNED INT NOT NULL,
+ ilist VARBINARY NOT NULL,
+ UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out] trx dictionary transaction
+@param[in] index fulltext index
+@param[in] id table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+ dict_table_t* table, /*!< in/out: Table with FTS index */
+ mem_heap_t* heap); /*!< in: temporary memory heap, or NULL */
+
+/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables().
+@param trx transaction
+@param index fulltext index */
+dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index);
+
+/** Lock the internal common FTS_ tables, before fts_drop_common_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table);
+
+/** Lock the internal FTS_ tables for table, before fts_drop_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table);
+
+/** Drop the internal FTS_ tables for table.
+@param trx transaction
+@param table table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table);
+
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_commit(
+/*=======*/
+ trx_t* trx) /*!< in: transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** FTS Query entry point.
+@param[in,out] trx transaction
+@param[in] index fts index to search
+@param[in] flags FTS search mode
+@param[in] query_str FTS query
+@param[in] query_len FTS query string len in bytes
+@param[in,out] result result doc ids
+@return DB_SUCCESS if successful otherwise error code */
+dberr_t
+fts_query(
+ trx_t* trx,
+ dict_index_t* index,
+ uint flags,
+ const byte* query_str,
+ ulint query_len,
+ fts_result_t** result)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+float
+fts_retrieve_ranking(
+/*=================*/
+ fts_result_t* result, /*!< in: FTS result structure */
+ doc_id_t doc_id); /*!< in: the interested document
+ doc_id */
+
+/******************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+ fts_result_t* result); /*!< out: result instance
+ to sort.*/
+
+/******************************************************************//**
+FTS Query free result, returned by fts_query(). */
+void
+fts_query_free_result(
+/*==================*/
+ fts_result_t* result); /*!< in: result instance
+ to free.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ dtuple_t* row); /*!< in: row whose FTS doc id we
+ want to extract.*/
+
+/** Extract the doc id from the record that belongs to index.
+@param[in] rec record containing FTS_DOC_ID
+@param[in] index index of rec
+@param[in] offsets rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets);
+
+/** Add new fts doc id to the update vector.
+@param[in] table the table that contains the FTS index.
+@param[in,out] ufield the fts doc id field in the update vector.
+ No new memory is allocated for this in this
+ function.
+@param[in,out] next_doc_id the fts doc id that has been added to the
+ update vector. If 0, a new fts doc id is
+ automatically generated. The memory provided
+ for this argument will be used by the update
+ vector. Ensure that the life time of this
+ memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+ dict_table_t* table,
+ upd_field_t* ufield,
+ doc_id_t* next_doc_id);
+
+/******************************************************************//**
+FTS initialize. */
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+ dict_table_t* table); /*!< out: table with FTS
+ indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+dberr_t
+fts_optimize_table(
+/*===============*/
+ dict_table_t* table); /*!< in: table to optimiza */
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+void
+fts_optimize_init(void);
+/*====================*/
+
+/****************************************************************//**
+Drops index ancillary tables for a FTS index
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Add the table to add to the OPTIMIZER's list.
+@param[in] table table to add */
+void
+fts_optimize_add_table(
+ dict_table_t* table);
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+void
+fts_optimize_remove_table(
+/*======================*/
+ dict_table_t* table); /*!< in: table to remove */
+
+/** Shutdown fts optimize thread. */
+void
+fts_optimize_shutdown();
+
+/** Send sync fts cache for the table.
+@param[in] table table to sync */
+void
+fts_optimize_request_sync_table(
+ dict_table_t* table);
+
+/**********************************************************************//**
+Take a FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+ fts_trx_t* fts_trx, /*!< in: fts transaction */
+ const char* name); /*!< in: savepoint name */
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+ trx_t* trx); /*!< in: transaction */
+
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+void
+fts_savepoint_release(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: savepoint name */
+
+/** Clear cache.
+@param[in,out] cache fts cache */
+void
+fts_cache_clear(
+ fts_cache_t* cache);
+
+/*********************************************************************//**
+Initialize things in cache. */
+void
+fts_cache_init(
+/*===========*/
+ fts_cache_t* cache); /*!< in: cache */
+
+/*********************************************************************//**
+Rollback to and including savepoint indentified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: savepoint name */
+
+/*********************************************************************//**
+Rollback to and including savepoint indentified by name. */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+ trx_t* trx); /*!< in: transaction */
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out] table fts table
+@param[in] wait whether to wait for existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait = true);
+
+/****************************************************************//**
+Create an FTS index cache. */
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+ dict_index_t* index); /*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+ const dict_table_t* table); /*!< in: table */
+
+/******************************************************************//**
+compare two character string according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+ CHARSET_INFO* cs, /*!< in: Character set */
+ char* src, /*!< in: string to put in
+ lower case */
+ size_t src_len, /*!< in: input string length */
+ char* dst, /*!< in: buffer for result
+ string */
+ size_t dst_len); /*!< in: buffer size */
+
+
+/******************************************************************//**
+compare two character string according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+ CHARSET_INFO* charset, /*!< in: Character set */
+ const byte* start, /*!< in: start of text */
+ const byte* end, /*!< in: one character past
+ end of text */
+ fts_string_t* token); /*!< out: token's text */
+
+/*************************************************************//**
+Get token char size by charset
+@return the number of token char size */
+ulint
+fts_get_token_size(
+/*===============*/
+ const CHARSET_INFO* cs, /*!< in: Character set */
+ const char* token, /*!< in: token */
+ ulint len); /*!< in: token length */
+
+/*************************************************************//**
+FULLTEXT tokenizer internal in MYSQL_FTPARSER_SIMPLE_MODE
+@return 0 if tokenize sucessfully */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+ MYSQL_FTPARSER_PARAM* param, /*!< in: parser parameter */
+ const char* doc, /*!< in: document to tokenize */
+ int len); /*!< in: document length */
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+ulint
+fts_get_rows_count(
+/*===============*/
+ fts_table_t* fts_table); /*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+ dict_table_t* table); /*!< in: user table */
+
+/** Check whether a stopword table is in the right format.
+@param stopword_table_name table name
+@param row_end name of the system-versioning end column, or "value"
+@return the stopword column charset
+@retval NULL if the table does not exist or qualify */
+CHARSET_INFO *fts_valid_stopword_table(const char *stopword_table_name,
+ const char **row_end= NULL);
+
+/****************************************************************//**
+This function loads specified stopword into FTS cache
+@return true if success */
+bool
+fts_load_stopword(
+/*==============*/
+ const dict_table_t*
+ table, /*!< in: Table with FTS */
+ trx_t* trx, /*!< in: Transaction */
+ const char* session_stopword_table, /*!< in: Session stopword table
+ name */
+ bool stopword_is_on, /*!< in: Whether stopword
+ option is turned on/off */
+ bool reload); /*!< in: Whether it is during
+ reload of FTS table */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS if OK */
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: aux table */
+ fts_doc_ids_t* doc_ids); /*!< in: For collecting
+ doc ids */
+/****************************************************************//**
+This function brings FTS index in sync when FTS index is first
+used. There are documents that have not yet sync-ed to auxiliary
+tables from last server abnormally shutdown, we will need to bring
+such document into FTS cache before any further operations */
+void
+fts_init_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table with FTS */
+ bool has_cache_lock); /*!< in: Whether we already
+ have cache lock */
+/*******************************************************************//**
+Add a newly create index in FTS cache */
+void
+fts_add_index(
+/*==========*/
+ dict_index_t* index, /*!< FTS index to be added */
+ dict_table_t* table); /*!< table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+dberr_t
+fts_drop_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table where indexes are dropped */
+ dict_index_t* index, /*!< in: Index to be dropped */
+ trx_t* trx); /*!< in: Transaction for the drop */
+
+/****************************************************************//**
+Rename auxiliary tables for all fts index for a table
+@return DB_SUCCESS or error code */
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+ dict_table_t* table, /*!< in: user Table */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx); /*!< in: transaction */
+
+/*******************************************************************//**
+Check indexes in the fts->indexes is also present in index cache and
+table->indexes list
+@return TRUE if all indexes match */
+ibool
+fts_check_cached_index(
+/*===================*/
+ dict_table_t* table); /*!< in: Table where indexes are dropped */
+
+/** Fetch the document from tuple, tokenize the text data and
+insert the text data into fts auxiliary table and
+its cache. Moreover this tuple fields doesn't contain any information
+about externally stored field. This tuple contains data directly
+converted from mysql.
+@param[in] ftt FTS transaction table
+@param[in] doc_id doc id
+@param[in] tuple tuple from where data can be retrieved
+ and tuple should be arranged in table
+ schema order. */
+void
+fts_add_doc_from_tuple(
+ fts_trx_table_t*ftt,
+ doc_id_t doc_id,
+ const dtuple_t* tuple);
+
+/** Create an FTS trx.
+@param[in,out] trx InnoDB Transaction
+@return FTS transaction. */
+fts_trx_t*
+fts_trx_create(
+ trx_t* trx);
+
+/** Clear all fts resources when there is no internal DOC_ID
+and there are no new fts index to add.
+@param[in,out] table table where fts is to be freed */
+void fts_clear_all(dict_table_t *table);
+
+/** Check whether the given name is fts auxiliary table
+and fetch the parent table id and index id
+@param[in] name table name
+@param[in,out] table_id parent table id
+@param[in,out] index_id index id
+@return true if it is auxilary table */
+bool fts_check_aux_table(const char *name,
+ table_id_t *table_id,
+ index_id_t *index_id);
+
+/** Update the last document id. This function could create a new
+transaction to update the last document id.
+@param table table to be updated
+@param doc_id last document id
+@param trx update trx or null
+@retval DB_SUCCESS if OK */
+dberr_t
+fts_update_sync_doc_id(const dict_table_t *table,
+ doc_id_t doc_id,
+ trx_t *trx)
+ MY_ATTRIBUTE((nonnull(1)));
+
+/** Sync the table during commit phase
+@param[in] table table to be synced */
+void fts_sync_during_ddl(dict_table_t* table);
diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h
new file mode 100644
index 00000000..c527ad8e
--- /dev/null
+++ b/storage/innobase/include/fts0opt.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/** The FTS optimize thread's work queue. */
+extern ib_wqueue_t* fts_optimize_wq;
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record. */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+ /* out: always returns non-NULL */
+ void* row, /* in: sel_node_t* */
+ void* user_arg); /* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h
new file mode 100644
index 00000000..8108e811
--- /dev/null
+++ b/storage/innobase/include/fts0pars.h
@@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison interface for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ FTS_OPER = 258,
+ FTS_TEXT = 259,
+ FTS_TERM = 260,
+ FTS_NUMB = 261
+ };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c */
+#line 61 "fts0pars.y"
+
+ int oper;
+ fts_ast_string_t* token;
+ fts_ast_node_t* node;
+
+
+
+/* Line 2068 of yacc.c */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/innobase/include/fts0plugin.h b/storage/innobase/include/fts0plugin.h
new file mode 100644
index 00000000..18ec2d6d
--- /dev/null
+++ b/storage/innobase/include/fts0plugin.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0plugin.h
+Full text search plugin header file
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PLUGIN_H
+#define INNOBASE_FTS0PLUGIN_H
+
+#include "univ.i"
+
+extern struct st_mysql_ftparser fts_default_parser;
+
+struct fts_ast_state_t;
+
+#define PARSER_INIT(parser, arg) if (parser->init) { parser->init(arg); }
+#define PARSER_DEINIT(parser, arg) if (parser->deinit) { parser->deinit(arg); }
+
+/******************************************************************//**
+fts parse query by plugin parser.
+@return 0 if parse successfully, or return non-zero. */
+int
+fts_parse_by_parser(
+/*================*/
+ ibool mode, /*!< in: query boolean mode */
+ uchar* query, /*!< in: query string */
+ ulint len, /*!< in: query string length */
+ st_mysql_ftparser* parse, /*!< in: fts plugin parser */
+ fts_ast_state_t* state); /*!< in: query parser state */
+
+#endif /* INNOBASE_FTS0PLUGIN_H */
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
new file mode 100644
index 00000000..ae0bb036
--- /dev/null
+++ b/storage/innobase/include/fts0priv.h
@@ -0,0 +1,485 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+ /* !<This must be 0 since we insert
+ a hard coded '0' at create time
+ to the config table */
+
+ FTS_TABLE_STATE_RUNNING = 0, /*!< Auxiliary tables created OK */
+
+ FTS_TABLE_STATE_OPTIMIZING, /*!< This is a substate of RUNNING */
+
+ FTS_TABLE_STATE_DELETED /*!< All aux tables to be dropped when
+ it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The default time to wait for the background thread (in microsecnds). */
+#define FTS_MAX_BACKGROUND_THREAD_WAIT 10000
+
+/** Maximum number of iterations to wait before we complain */
+#define FTS_BACKGROUND_THREAD_WAIT_COUNT 1000
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_NAME_LEN 64
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_VALUE_LEN 1024
+
+/** Approx. upper limit of ilist length in bytes. */
+#define FTS_ILIST_MAX_SIZE (64 * 1024)
+
+/** FTS config table name parameters */
+
+/** The number of seconds after which an OPTIMIZE run will stop */
+#define FTS_OPTIMIZE_LIMIT_IN_SECS "optimize_checkpoint_limit"
+
+/** The next doc id */
+#define FTS_SYNCED_DOC_ID "synced_doc_id"
+
+/** The last word that was OPTIMIZED */
+#define FTS_LAST_OPTIMIZED_WORD "last_optimized_word"
+
+/** Total number of documents that have been deleted. The next_doc_id
+minus this count gives us the total number of documents. */
+#define FTS_TOTAL_DELETED_COUNT "deleted_doc_count"
+
+/** Total number of words parsed from all documents */
+#define FTS_TOTAL_WORD_COUNT "total_word_count"
+
+/** Start of optimize of an FTS index */
+#define FTS_OPTIMIZE_START_TIME "optimize_start_time"
+
+/** End of optimize for an FTS index */
+#define FTS_OPTIMIZE_END_TIME "optimize_end_time"
+
+/** User specified stopword table name */
+#define FTS_STOPWORD_TABLE_NAME "stopword_table_name"
+
+/** Whether to use (turn on/off) stopword */
+#define FTS_USE_STOPWORD "use_stopword"
+
+/** State of the FTS system for this table. It can be one of
+ RUNNING, OPTIMIZING, DELETED. */
+#define FTS_TABLE_STATE "table_state"
+
+/** The minimum length of an FTS auxiliary table names's id component
+e.g., For an auxiliary table name
+
+ FTS_<TABLE_ID>_SUFFIX
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH 48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN 32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+que_t*
+fts_parse_sql(
+/*==========*/
+ fts_table_t* fts_table, /*!< in: FTS aux table */
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql) /*!< in: SQL string to evaluate */
+ MY_ATTRIBUTE((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+dberr_t
+fts_eval_sql(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t* graph) /*!< in: Parsed statement */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Construct the name of an internal FTS table for the given table.
+@param[in] fts_table metadata on fulltext-indexed table
+@param[out] table_name a name up to MAX_FULL_NAME_LEN
+@param[in] dict_locked whether dict_sys.latch is being held */
+void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
+ bool dict_locked = false)
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated WHERE string */
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+ dict_index_t* index, /*!< in: FTS index */
+ pars_info_t* info, /*!< in/out: parser info */
+ mem_heap_t* heap) /*!< in: memory heap */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** define for fts_doc_fetch_by_doc_id() "option" value, defines whether
+we want to get Doc whose ID is equal to or greater or smaller than supplied
+ID */
+#define FTS_FETCH_DOC_BY_ID_EQUAL 1
+#define FTS_FETCH_DOC_BY_ID_LARGE 2
+#define FTS_FETCH_DOC_BY_ID_SMALL 3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return: DB_SUCCESS if fetch is successful, else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+ fts_get_doc_t* get_doc, /*!< in: state */
+ doc_id_t doc_id, /*!< in: id of document to fetch */
+ dict_index_t* index_to_use, /*!< in: caller supplied FTS index,
+ or NULL */
+ ulint option, /*!< in: search option, if it is
+ greater than doc_id or equal */
+ fts_sql_callback
+ callback, /*!< in: callback to read
+ records */
+ void* arg) /*!< in: callback arg */
+ MY_ATTRIBUTE((nonnull(6)));
+
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: query graph */
+ fts_table_t* fts_table, /*!< in: the FTS aux index */
+ fts_string_t* word, /*!< in: word in UTF-8 */
+ fts_node_t* node) /*!< in: node columns */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if a fts token is a stopword or less than fts_min_token_size
+or greater than fts_max_token_size.
+@param[in] token token string
+@param[in] stopwords stopwords rb tree
+@param[in] cs token charset
+@retval true if it is not stopword and length in range
+@retval false if it is stopword or length not in range */
+bool
+fts_check_token(
+ const fts_string_t* token,
+ const ib_rbt_t* stopwords,
+ const CHARSET_INFO* cs);
+
+/******************************************************************//**
+Initialize a document. */
+void
+fts_doc_init(
+/*=========*/
+ fts_doc_t* doc) /*!< in: doc to initialize */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************************//**
+Do a binary search for a doc id in the array
+@return +ve index if found -ve index where it should be
+ inserted if not found */
+int
+fts_bsearch(
+/*========*/
+ doc_id_t* array, /*!< in: array to sort */
+ int lower, /*!< in: lower bound of array*/
+ int upper, /*!< in: upper bound of array*/
+ doc_id_t doc_id) /*!< in: doc id to lookup */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Free document. */
+void
+fts_doc_free(
+/*=========*/
+ fts_doc_t* doc) /*!< in: document */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Free fts_optimizer_word_t instanace.*/
+void
+fts_word_free(
+/*==========*/
+ fts_word_t* word) /*!< in: instance to free.*/
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Read the rows from the FTS inde
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: prepared statement */
+ fts_table_t* fts_table, /*!< in: FTS aux table */
+ const fts_string_t*
+ word, /*!< in: the word to fetch */
+ fts_fetch_t* fetch) /*!< in: fetch callback.*/
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances, we actually compare the
+table id's here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+ const void* v1, /*!< in: id1 */
+ const void* v2) /*!< in: id2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with a trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
+#define fts_sql_rollback(trx) (trx)->rollback()
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_value(
+/*=================*/
+ trx_t* trx, /* transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_value(
+/*=================*/
+ trx_t* trx, /*!< transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ const fts_string_t*
+ value) /*!< in: value to update */
+ MY_ATTRIBUTE((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Set an ulint value int the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+ const fts_index_cache_t*
+ index_cache, /*!< in: cache to search */
+ const fts_string_t*
+ text) /*!< in: word to search for */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Append deleted doc ids to vector and sort the vector. */
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+ fts_cache_t* cache, /*!< in: cache to use */
+ ib_vector_t* vector); /*!< in: append to this vector */
+/******************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+fts_index_cache_t*
+fts_find_index_cache(
+/*================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to search */
+ const dict_index_t*
+ index) /*!< in: index to search for */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /*!< in: a table/index id */
+ char* str); /*!< in: buffer to write the id to */
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+ ib_id_t* id, /*!< out: a table id */
+ const char* str) /*!< in: buffer to read from */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+int
+fts_get_table_id(
+/*=============*/
+ const fts_table_t*
+ fts_table, /*!< in: FTS Auxiliary table */
+ char* table_id) /*!< out: table id, must be at least
+ FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+ long */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Add node positions. */
+void
+fts_cache_node_add_positions(
+/*=========================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_node_t* node, /*!< in: word node */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* positions) /*!< in: fts_token_t::positions */
+ MY_ATTRIBUTE((nonnull(2,4)));
+
+/******************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+ const char* param, /*!< in: base name of param */
+ const dict_index_t* index) /*!< in: index for config */
+ MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
+
+#include "fts0priv.inl"
+
+#endif /* INNOBASE_FTS0PRIV_H */
diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl
new file mode 100644
index 00000000..da14cfcb
--- /dev/null
+++ b/storage/innobase/include/fts0priv.inl
@@ -0,0 +1,121 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.ic
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /* in: a table/index id */
+ char* str) /* in: buffer to write the id to */
+{
+
+#ifdef _WIN32
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
+ return(sprintf(str, UINT64PFx, id)););
+
+ /* Use this to construct old(5.6.14 and 5.7.3) windows
+ ambiguous aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, "%016llu", (ulonglong) id)););
+
+#else /* _WIN32 */
+
+ /* Use this to construct old(5.6.14 and 5.7.3) windows
+ ambiguous aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
+ return(sprintf(str, "%016llu", (ulonglong) id)););
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, "%016llx", (ulonglong) id)););
+
+#endif /* _WIN32 */
+
+ return(sprintf(str, "%016llx", (ulonglong) id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+ ib_id_t* id, /* out: an id */
+ const char* str) /* in: buffer to read from */
+{
+ /* NOTE: this func doesn't care about whether current table
+ is set with HEX_NAME, the user of the id read here will check
+ if the id is HEX or DEC and do the right thing with it. */
+ return(sscanf(str, UINT64PFx, id) == 1);
+}
+
+/******************************************************************//**
+Compare two fts_trx_table_t instances.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const dict_table_t* table1
+ = (*static_cast<const fts_trx_table_t* const*>(p1))->table;
+
+ const dict_table_t* table2
+ = (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+ return((table1->id > table2->id)
+ ? 1
+ : (table1->id == table2->id)
+ ? 0
+ : -1);
+}
+
+/******************************************************************//**
+Compare a table id with a fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const uintmax_t* table_id = static_cast<const uintmax_t*>(p1);
+ const dict_table_t* table2
+ = (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+ return((*table_id > table2->id)
+ ? 1
+ : (*table_id == table2->id)
+ ? 0
+ : -1);
+}
diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h
new file mode 100644
index 00000000..89655ca1
--- /dev/null
+++ b/storage/innobase/include/fts0tlex.h
@@ -0,0 +1,702 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0t_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0t_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0t_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0t_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0t_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0t_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0t_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0t_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0t_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0t_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0t_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0t_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0t_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0t_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0t_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0t_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0t_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0t_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0tpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0tpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0tpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0tpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0tensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0tensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0tlex_ALREADY_DEFINED
+#else
+#define yylex fts0tlex
+#endif
+
+#ifdef yyrestart
+#define fts0trestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0trestart
+#endif
+
+#ifdef yylex_init
+#define fts0tlex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0tlex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0tlex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0tlex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0tlex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0tlex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0tget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0tget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0tset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0tset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0tget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0tget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0tset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0tset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0tget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0tget_in
+#endif
+
+#ifdef yyset_in
+#define fts0tset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0tset_in
+#endif
+
+#ifdef yyget_out
+#define fts0tget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0tget_out
+#endif
+
+#ifdef yyset_out
+#define fts0tset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0tset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0tget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0tget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0tget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0tget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0tget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0tget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0tset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0tset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0tget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0tget_column
+#endif
+
+#ifdef yyset_column
+#define fts0tset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0tset_column
+#endif
+
+#ifdef yywrap
+#define fts0twrap_ALREADY_DEFINED
+#else
+#define yywrap fts0twrap
+#endif
+
+#ifdef yyalloc
+#define fts0talloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0talloc
+#endif
+
+#ifdef yyrealloc
+#define fts0trealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0trealloc
+#endif
+
+#ifdef yyfree
+#define fts0tfree_ALREADY_DEFINED
+#else
+#define yyfree fts0tfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ int yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0twrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+ int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#ifndef fts0t_create_buffer_ALREADY_DEFINED
+#undef yy_create_buffer
+#endif
+#ifndef fts0t_delete_buffer_ALREADY_DEFINED
+#undef yy_delete_buffer
+#endif
+#ifndef fts0t_scan_buffer_ALREADY_DEFINED
+#undef yy_scan_buffer
+#endif
+#ifndef fts0t_scan_string_ALREADY_DEFINED
+#undef yy_scan_string
+#endif
+#ifndef fts0t_scan_bytes_ALREADY_DEFINED
+#undef yy_scan_bytes
+#endif
+#ifndef fts0t_init_buffer_ALREADY_DEFINED
+#undef yy_init_buffer
+#endif
+#ifndef fts0t_flush_buffer_ALREADY_DEFINED
+#undef yy_flush_buffer
+#endif
+#ifndef fts0t_load_buffer_state_ALREADY_DEFINED
+#undef yy_load_buffer_state
+#endif
+#ifndef fts0t_switch_to_buffer_ALREADY_DEFINED
+#undef yy_switch_to_buffer
+#endif
+#ifndef fts0tpush_buffer_state_ALREADY_DEFINED
+#undef yypush_buffer_state
+#endif
+#ifndef fts0tpop_buffer_state_ALREADY_DEFINED
+#undef yypop_buffer_state
+#endif
+#ifndef fts0tensure_buffer_stack_ALREADY_DEFINED
+#undef yyensure_buffer_stack
+#endif
+#ifndef fts0tlex_ALREADY_DEFINED
+#undef yylex
+#endif
+#ifndef fts0trestart_ALREADY_DEFINED
+#undef yyrestart
+#endif
+#ifndef fts0tlex_init_ALREADY_DEFINED
+#undef yylex_init
+#endif
+#ifndef fts0tlex_init_extra_ALREADY_DEFINED
+#undef yylex_init_extra
+#endif
+#ifndef fts0tlex_destroy_ALREADY_DEFINED
+#undef yylex_destroy
+#endif
+#ifndef fts0tget_debug_ALREADY_DEFINED
+#undef yyget_debug
+#endif
+#ifndef fts0tset_debug_ALREADY_DEFINED
+#undef yyset_debug
+#endif
+#ifndef fts0tget_extra_ALREADY_DEFINED
+#undef yyget_extra
+#endif
+#ifndef fts0tset_extra_ALREADY_DEFINED
+#undef yyset_extra
+#endif
+#ifndef fts0tget_in_ALREADY_DEFINED
+#undef yyget_in
+#endif
+#ifndef fts0tset_in_ALREADY_DEFINED
+#undef yyset_in
+#endif
+#ifndef fts0tget_out_ALREADY_DEFINED
+#undef yyget_out
+#endif
+#ifndef fts0tset_out_ALREADY_DEFINED
+#undef yyset_out
+#endif
+#ifndef fts0tget_leng_ALREADY_DEFINED
+#undef yyget_leng
+#endif
+#ifndef fts0tget_text_ALREADY_DEFINED
+#undef yyget_text
+#endif
+#ifndef fts0tget_lineno_ALREADY_DEFINED
+#undef yyget_lineno
+#endif
+#ifndef fts0tset_lineno_ALREADY_DEFINED
+#undef yyset_lineno
+#endif
+#ifndef fts0tget_column_ALREADY_DEFINED
+#undef yyget_column
+#endif
+#ifndef fts0tset_column_ALREADY_DEFINED
+#undef yyset_column
+#endif
+#ifndef fts0twrap_ALREADY_DEFINED
+#undef yywrap
+#endif
+#ifndef fts0tget_lval_ALREADY_DEFINED
+#undef yyget_lval
+#endif
+#ifndef fts0tset_lval_ALREADY_DEFINED
+#undef yyset_lval
+#endif
+#ifndef fts0tget_lloc_ALREADY_DEFINED
+#undef yyget_lloc
+#endif
+#ifndef fts0tset_lloc_ALREADY_DEFINED
+#undef yyset_lloc
+#endif
+#ifndef fts0talloc_ALREADY_DEFINED
+#undef yyalloc
+#endif
+#ifndef fts0trealloc_ALREADY_DEFINED
+#undef yyrealloc
+#endif
+#ifndef fts0tfree_ALREADY_DEFINED
+#undef yyfree
+#endif
+#ifndef fts0ttext_ALREADY_DEFINED
+#undef yytext
+#endif
+#ifndef fts0tleng_ALREADY_DEFINED
+#undef yyleng
+#endif
+#ifndef fts0tin_ALREADY_DEFINED
+#undef yyin
+#endif
+#ifndef fts0tout_ALREADY_DEFINED
+#undef yyout
+#endif
+#ifndef fts0t_flex_debug_ALREADY_DEFINED
+#undef yy_flex_debug
+#endif
+#ifndef fts0tlineno_ALREADY_DEFINED
+#undef yylineno
+#endif
+#ifndef fts0ttables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0ttables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0tTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 69 "fts0tlex.l"
+
+
+#line 701 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
new file mode 100644
index 00000000..1cddaf5b
--- /dev/null
+++ b/storage/innobase/include/fts0tokenize.h
@@ -0,0 +1,189 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0tokenize.cc
+Full Text Search plugin tokenizer refer to MyISAM
+
+Created 2014/11/17 Shaohua Wang
+***********************************************************************/
+
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+#include "m_ctype.h"
+
+/* Macros and structs below are from ftdefs.h in MyISAM */
+/** Check a char is true word */
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+/** Check if a char is misc word */
+#define misc_word_char(X) 0
+
+/** Boolean search syntax */
+static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
+
+#define FTB_YES (fts_boolean_syntax[0])
+#define FTB_EGAL (fts_boolean_syntax[1])
+#define FTB_NO (fts_boolean_syntax[2])
+#define FTB_INC (fts_boolean_syntax[3])
+#define FTB_DEC (fts_boolean_syntax[4])
+#define FTB_LBR (fts_boolean_syntax[5])
+#define FTB_RBR (fts_boolean_syntax[6])
+#define FTB_NEG (fts_boolean_syntax[7])
+#define FTB_TRUNC (fts_boolean_syntax[8])
+#define FTB_LQUOT (fts_boolean_syntax[10])
+#define FTB_RQUOT (fts_boolean_syntax[11])
+
+/** FTS query token */
+typedef struct st_ft_word {
+ uchar* pos; /*!< word start pointer */
+ uint len; /*!< word len */
+ double weight; /*!< word weight, unused in innodb */
+} FT_WORD;
+
+/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
+Differences: a. code format changed; b. stopword processing removed.
+@param[in] cs charset
+@param[in,out] start doc start pointer
+@param[in,out] end doc end pointer
+@param[in,out] word token
+@param[in,out] info token info
+@retval 0 eof
+@retval 1 word found
+@retval 2 left bracket
+@retval 3 right bracket
+@retval 4 stopword found */
+inline
+uchar
+fts_get_word(
+ const CHARSET_INFO* cs,
+ uchar** start,
+ uchar* end,
+ FT_WORD* word,
+ MYSQL_FTPARSER_BOOLEAN_INFO*
+ info)
+{
+ uchar* doc = *start;
+ int ctype;
+ uint mwc;
+ uint length;
+ int mbl;
+
+ info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
+ info->weight_adjust = info->wasign = 0;
+ info->type = FT_TOKEN_EOF;
+
+ while (doc < end) {
+ for (; doc < end;
+ doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+ mbl = cs->ctype(&ctype, doc, end);
+
+ if (true_word_char(ctype, *doc)) {
+ break;
+ }
+
+ if (*doc == FTB_RQUOT && info->quot) {
+ *start = doc + 1;
+ info->type = FT_TOKEN_RIGHT_PAREN;
+
+ return(info->type);
+ }
+
+ if (!info->quot) {
+ if (*doc == FTB_LBR
+ || *doc == FTB_RBR
+ || *doc == FTB_LQUOT) {
+ /* param->prev=' '; */
+ *start = doc + 1;
+ if (*doc == FTB_LQUOT) {
+ info->quot = (char*)1;
+ }
+
+ info->type = (*doc == FTB_RBR ?
+ FT_TOKEN_RIGHT_PAREN :
+ FT_TOKEN_LEFT_PAREN);
+
+ return(info->type);
+ }
+
+ if (info->prev == ' ') {
+ if (*doc == FTB_YES) {
+ info->yesno = +1;
+ continue;
+ } else if (*doc == FTB_EGAL) {
+ info->yesno = 0;
+ continue;
+ } else if (*doc == FTB_NO) {
+ info->yesno = -1;
+ continue;
+ } else if (*doc == FTB_INC) {
+ info->weight_adjust++;
+ continue;
+ } else if (*doc == FTB_DEC) {
+ info->weight_adjust--;
+ continue;
+ } else if (*doc == FTB_NEG) {
+ info->wasign = !info->wasign;
+ continue;
+ }
+ }
+ }
+
+ info->prev = char(*doc);
+ info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+ info->weight_adjust = info->wasign = 0;
+ }
+
+ mwc = length = 0;
+ for (word->pos = doc;
+ doc < end;
+ length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+ mbl = cs->ctype(&ctype, doc, end);
+
+ if (true_word_char(ctype, *doc)) {
+ mwc = 0;
+ } else if (!misc_word_char(*doc) || mwc) {
+ break;
+ } else {
+ mwc++;
+ }
+ }
+
+ /* Be sure *prev is true_word_char. */
+ info->prev = 'A';
+ word->len = (uint)(doc-word->pos) - mwc;
+
+ if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
+ doc++;
+ }
+
+ /* We don't check stopword here. */
+ *start = doc;
+ info->type = FT_TOKEN_WORD;
+
+ return(info->type);
+ }
+
+ if (info->quot) {
+ *start = doc;
+ info->type = FT_TOKEN_RIGHT_PAREN;
+ }
+
+ return(info->type);
+}
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000..fb278d54
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,354 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "fts0fts.h"
+#include "pars0pars.h"
+#include "que0types.h"
+#include "ut0byte.h"
+#include "ut0rbt.h"
+
+/** Types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_t {
+ doc_id_t doc_id; /*!< Document id */
+ ulint word_count; /*!< Total words in the document */
+};
+
+/** It's main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+ fts_index_cache_t*
+ index_cache; /*!< The index cache instance */
+
+ /*!< Parsed sql statement */
+ que_t* get_document_graph;
+ fts_cache_t* cache; /*!< The parent cache */
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per index cache of words etc. */
+struct fts_index_cache_t {
+ dict_index_t* index; /*!< The FTS index instance */
+
+ ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*,
+ cells are fts_tokenizer_word_t*.*/
+
+ ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t
+ contained in the memory buffer.
+ Must be in sorted order (ascending).
+ The ideal choice is an rb tree but
+ the rb tree imposes a space overhead
+ that we can do without */
+
+ que_t** ins_graph; /*!< Insert query graphs */
+
+ que_t** sel_graph; /*!< Select query graphs */
+ CHARSET_INFO* charset; /*!< charset */
+};
+
+/** Stop word control infotmation. */
+struct fts_stopword_t {
+ ulint status; /*!< Status of the stopword tree */
+ ib_alloc_t* heap; /*!< The memory allocator to use */
+ ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */
+ CHARSET_INFO* charset; /*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+ trx_t* trx; /*!< The transaction used for SYNCing
+ the cache to disk */
+ dict_table_t* table; /*!< Table with FTS index(es) */
+ ulint max_cache_size; /*!< Max size in bytes of the cache */
+ ibool cache_full; /*!< flag, when true it indicates that
+ we need to sync the cache to disk */
+ ulint lower_index; /*!< the start index of the doc id
+ vector from where to start adding
+ documents to the FTS cache */
+ ulint upper_index; /*!< max index of the doc id vector to
+ add to the FTS cache */
+ ibool interrupted; /*!< TRUE if SYNC was interrupted */
+ doc_id_t min_doc_id; /*!< The smallest doc id added to the
+ cache. It should equal to
+ doc_ids[lower_index] */
+ doc_id_t max_doc_id; /*!< The doc id at which the cache was
+ noted as being full, we use this to
+ set the upper_limit field */
+ time_t start_time; /*!< SYNC start time; only used if
+ fts_enable_diag_print */
+ bool in_progress; /*!< flag whether sync is in progress.*/
+ bool unlock_cache; /*!< flag whether unlock cache when
+ write fts node */
+ /** condition variable for in_progress; used with table->fts->cache->lock */
+ pthread_cond_t cond;
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_t
+{
+ /** lock protecting all access to the memory buffer */
+ mysql_mutex_t lock;
+ /** cache initialization */
+ mysql_mutex_t init_lock;
+
+ /** protection for deleted_doc_ids */
+ mysql_mutex_t deleted_lock;
+
+ /** protection for DOC_ID */
+ mysql_mutex_t doc_id_lock;
+
+ ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
+ element is of type fts_update_t */
+
+ ib_vector_t* indexes; /*!< We store the stats and inverted
+ index for the individual FTS indexes
+ in this vector. Each element is
+ an instance of fts_index_cache_t */
+
+ ib_vector_t* get_docs; /*!< information required to read
+ the document from the table. Each
+ element is of type fts_doc_t */
+
+ size_t total_size; /*!< total size consumed by the ilist
+ field of all nodes. SYNC is run
+ whenever this gets too big */
+ /** total_size at the time of the previous SYNC request */
+ size_t total_size_at_sync;
+
+ fts_sync_t* sync; /*!< sync structure to sync data to
+ disk */
+ ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes
+ and deleted_doc_ids, ie. transient
+ objects, they are recreated after
+ a SYNC is completed */
+
+ ib_alloc_t* self_heap; /*!< This heap is the heap out of
+ which an instance of the cache itself
+ was created. Objects created using
+ this heap will last for the lifetime
+ of the cache */
+
+ doc_id_t next_doc_id; /*!< Next doc id */
+
+ doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
+
+ doc_id_t first_doc_id; /*!< first doc id since this table
+ was opened */
+
+ ulint deleted; /*!< Number of doc ids deleted since
+ last optimized. This variable is
+ covered by deleted_lock */
+
+ ulint added; /*!< Number of doc ids added since last
+ optimized. This variable is covered by
+ the deleted lock */
+
+ fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
+ mem_heap_t* cache_heap; /*!< Cache Heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table */
+struct fts_node_t {
+ doc_id_t first_doc_id; /*!< First document id in ilist. */
+
+ doc_id_t last_doc_id; /*!< Last document id in ilist. */
+
+ byte* ilist; /*!< Binary list of documents & word
+ positions the token appears in.
+ TODO: For now, these are simply
+ ut_malloc'd, but if testing shows
+ that they waste memory unacceptably, a
+ special memory allocator will have
+ to be written */
+
+ ulint doc_count; /*!< Number of doc ids in ilist */
+
+ ulint ilist_size; /*!< Used size of ilist in bytes. */
+
+ ulint ilist_size_alloc;
+ /*!< Allocated size of ilist in
+ bytes */
+ bool synced; /*!< flag whether the node is synced */
+};
+
+/** A tokenizer word. Contains information about one word. */
+struct fts_tokenizer_word_t {
+ fts_string_t text; /*!< Token text. */
+
+ ib_vector_t* nodes; /*!< Word node ilists, each element is
+ of type fts_node_t */
+};
+
+/** Word text plus it's array of nodes as on disk in FTS index */
+struct fts_word_t {
+ fts_string_t text; /*!< Word value in UTF-8 */
+ ib_vector_t* nodes; /*!< Nodes read from disk */
+
+ ib_alloc_t* heap_alloc; /*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+ void* read_arg; /*!< Arg for the sql_callback */
+
+ fts_sql_callback
+ read_record; /*!< Callback for reading index
+ record */
+ size_t total_memory; /*!< Total memory used */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+ ulint value; /*!< Character value at which
+ to split */
+
+ const char* suffix; /*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+ fts_string_t text; /*!< document text */
+
+ ibool found; /*!< TRUE if the document was found
+ successfully in the database */
+
+ ib_rbt_t* tokens; /*!< This is filled when the document
+ is tokenized. Tokens; indexed by
+ fts_string_t*, cells are of type
+ fts_token_t* */
+
+ ib_alloc_t* self_heap; /*!< An instance of this type is
+ allocated from this heap along
+ with any objects that have the
+ same lifespan, most notably
+ the vector of token positions */
+ CHARSET_INFO* charset; /*!< Document's charset info */
+
+ st_mysql_ftparser* parser; /*!< fts plugin parser */
+
+ ib_rbt_t* stopwords; /*!< Stopwords */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+ fts_string_t text; /*!< token text */
+
+ ib_vector_t* positions; /*!< an array of the positions the
+ token is found in; each item is
+ actually an ulint. */
+};
+
+/** It's defined in fts/fts0fts.c */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two fts_trx_row_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_ranking_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare two doc_ids. */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ fts_string_t* dst, /*!< in: dup to here */
+ const fts_string_t* src, /*!< in: src string */
+ mem_heap_t* heap); /*!< in: heap to use */
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected); /*!< in: selected index */
+
+/** Select the FTS auxiliary index for the given character.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length in bytes
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len);
+
+#include "fts0types.inl"
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/innobase/include/fts0types.inl b/storage/innobase/include/fts0types.inl
new file mode 100644
index 00000000..facc1e5c
--- /dev/null
+++ b/storage/innobase/include/fts0types.inl
@@ -0,0 +1,231 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.ic
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+/******************************************************************//**
+Duplicate a string.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+ fts_string_t* dst, /*!< in: dup to here */
+ const fts_string_t* src, /*!< in: src string */
+ mem_heap_t* heap) /*!< in: heap to use */
+{
+ dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1);
+ memcpy(dst->f_str, src->f_str, src->f_len);
+
+ dst->f_len = src->f_len;
+ dst->f_str[src->f_len] = 0;
+ dst->f_n_char = src->f_n_char;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1;
+ const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2;
+
+ return((int)(tr1->doc_id - tr2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_ranking_t* rk1 = (const fts_ranking_t*) p1;
+ const fts_ranking_t* rk2 = (const fts_ranking_t*) p2;
+
+ return((int)(rk1->doc_id - rk2->doc_id));
+}
+
+/******************************************************************//**
+Compare two doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const doc_id_t* up1 = static_cast<const doc_id_t*>(p1);
+ const doc_id_t* up2 = static_cast<const doc_id_t*>(p2);
+
+ return static_cast<int>(*up1 - *up2);
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+ const CHARSET_INFO* cs, /*!< in: Character set */
+ const uchar* p2, /*!< in: string */
+ const ulint len2); /*!< in: string length */
+
+/** Check if fts index charset is cjk
+@param[in] cs charset
+@retval true if the charset is cjk
+@retval false if not. */
+inline bool fts_is_charset_cjk(const CHARSET_INFO* cs)
+{
+ switch (cs->number) {
+ case 24: /* my_charset_gb2312_chinese_ci */
+ case 28: /* my_charset_gbk_chinese_ci */
+ case 1: /* my_charset_big5_chinese_ci */
+ case 12: /* my_charset_ujis_japanese_ci */
+ case 13: /* my_charset_sjis_japanese_ci */
+ case 95: /* my_charset_cp932_japanese_ci */
+ case 97: /* my_charset_eucjpms_japanese_ci */
+ case 19: /* my_charset_euckr_korean_ci */
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Select the FTS auxiliary index for the given character by range.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_range(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
+{
+ ulint selected = 0;
+ ulint value = innobase_strnxfrm(cs, str, len);
+
+ while (fts_index_selector[selected].value != 0) {
+
+ if (fts_index_selector[selected].value == value) {
+
+ return(selected);
+
+ } else if (fts_index_selector[selected].value > value) {
+
+ return(selected > 0 ? selected - 1 : 0);
+ }
+
+ ++selected;
+ }
+
+ ut_ad(selected > 1);
+
+ return(selected - 1);
+}
+
+/** Select the FTS auxiliary index for the given character by hash.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_hash(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
+{
+ ulong nr1 = 1;
+ ulong nr2 = 4;
+
+ ut_ad(!(str == NULL && len > 0));
+
+ if (str == NULL || len == 0) {
+ return 0;
+ }
+
+ /* Get the first char */
+ /* JAN: TODO: MySQL 5.7 had
+ char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str),
+ reinterpret_cast<const char*>(str + len));
+ */
+ size_t char_len = size_t(cs->charlen(str, str + len));
+
+ ut_ad(char_len <= len);
+
+ /* Get collation hash code */
+ my_ci_hash_sort(cs, str, char_len, &nr1, &nr2);
+
+ return(nr1 % FTS_NUM_AUX_INDEX);
+}
+
+/** Select the FTS auxiliary index for the given character.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length in bytes
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
+{
+ ulint selected;
+
+ if (fts_is_charset_cjk(cs)) {
+ selected = fts_select_index_by_hash(cs, str, len);
+ } else {
+ selected = fts_select_index_by_range(cs, str, len);
+ }
+
+ return(selected);
+}
+
+/******************************************************************//**
+Return the selected FTS aux index suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected) /*!< in: selected index */
+{
+ return(fts_index_selector[selected].suffix);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/innobase/include/fts0vlc.h b/storage/innobase/include/fts0vlc.h
new file mode 100644
index 00000000..d6e60377
--- /dev/null
+++ b/storage/innobase/include/fts0vlc.h
@@ -0,0 +1,124 @@
+/**
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+**/
+/**
+@file include/fts0vlc.h
+Full text variable length integer encoding/decoding.
+
+Created 2021-10-19 Thirunarayanan Balathandayuthapani
+**/
+
+/** Return length of val if it were encoded using our VLC scheme.
+@param val value to encode
+@return length of value encoded, in bytes */
+inline size_t fts_get_encoded_len(doc_id_t val)
+{
+ if (val < static_cast<doc_id_t>(1) << 7)
+ return 1;
+ if (val < static_cast<doc_id_t>(1) << 14)
+ return 2;
+ if (val < static_cast<doc_id_t>(1) << 21)
+ return 3;
+ if (val < static_cast<doc_id_t>(1) << 28)
+ return 4;
+ if (val < static_cast<doc_id_t>(1) << 35)
+ return 5;
+ if (val < static_cast<doc_id_t>(1) << 42)
+ return 6;
+ if (val < static_cast<doc_id_t>(1) << 49)
+ return 7;
+ if (val < static_cast<doc_id_t>(1) << 56)
+ return 8;
+ if (val < static_cast<doc_id_t>(1) << 63)
+ return 9;
+ return 10;
+}
+
+/** Encode an integer using our VLC scheme and return the
+length in bytes.
+@param val value to encode
+@param buf buffer, must have enough space
+@return length of value encoded, in bytes */
+inline byte *fts_encode_int(doc_id_t val, byte *buf)
+{
+ if (val < static_cast<doc_id_t>(1) << 7)
+ goto add_1;
+ if (val < static_cast<doc_id_t>(1) << 14)
+ goto add_2;
+ if (val < static_cast<doc_id_t>(1) << 21)
+ goto add_3;
+ if (val < static_cast<doc_id_t>(1) << 28)
+ goto add_4;
+ if (val < static_cast<doc_id_t>(1) << 35)
+ goto add_5;
+ if (val < static_cast<doc_id_t>(1) << 42)
+ goto add_6;
+ if (val < static_cast<doc_id_t>(1) << 49)
+ goto add_7;
+ if (val < static_cast<doc_id_t>(1) << 56)
+ goto add_8;
+ if (val < static_cast<doc_id_t>(1) << 63)
+ goto add_9;
+
+ *buf++= static_cast<byte>(val >> 63);
+add_9:
+ *buf++= static_cast<byte>(val >> 56) & 0x7F;
+add_8:
+ *buf++= static_cast<byte>(val >> 49) & 0x7F;
+add_7:
+ *buf++= static_cast<byte>(val >> 42) & 0x7F;
+add_6:
+ *buf++= static_cast<byte>(val >> 35) & 0x7F;
+add_5:
+ *buf++= static_cast<byte>(val >> 28) & 0x7F;
+add_4:
+ *buf++= static_cast<byte>(val >> 21) & 0x7F;
+add_3:
+ *buf++= static_cast<byte>(val >> 14) & 0x7F;
+add_2:
+ *buf++= static_cast<byte>(val >> 7) & 0x7F;
+add_1:
+ *buf++= static_cast<byte>(val) | 0x80;
+ return buf;
+}
+
+/** Decode and return the integer that was encoded using
+our VLC scheme.
+@param ptr pointer to decode from, this ptr is
+ incremented by the number of bytes decoded
+@return value decoded */
+inline doc_id_t fts_decode_vlc(const byte **ptr)
+{
+ ut_d(const byte *const start= *ptr);
+ ut_ad(*start);
+
+ doc_id_t val= 0;
+ for (;;)
+ {
+ byte b= *(*ptr)++;
+ val|= (b & 0x7F);
+
+ /* High-bit on means "last byte in the encoded integer". */
+ if (b & 0x80)
+ break;
+ ut_ad(val < static_cast<doc_id_t>(1) << (64 - 7));
+ val <<= 7;
+ }
+
+ ut_ad(*ptr - start <= 10);
+
+ return(val);
+}
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
new file mode 100644
index 00000000..746dab80
--- /dev/null
+++ b/storage/innobase/include/fut0lst.h
@@ -0,0 +1,156 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#pragma once
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
+
+#ifdef UNIV_INNOCHECKSUM
+# include "fil0fil.h"
+#else
+# include "mtr0log.h"
+
+typedef byte flst_base_node_t;
+typedef byte flst_node_t;
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV 0 /* 6-byte address of the previous list element;
+ the page part of address is FIL_NULL, if no
+ previous element */
+#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next
+ list element; the page part of address
+ is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN 0 /* 32-bit list length field */
+#define FLST_FIRST 4 /* 6-byte address of the first element
+ of the list; undefined if empty list */
+#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the
+ last element of the list; undefined
+ if empty list */
+
+/** Initialize a zero-initialized list base node.
+@param[in,out] block file page
+@param[in] ofs byte offset of the list base node
+@param[in,out] mtr mini-transaction */
+inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
+{
+ ut_d(const page_t *page= block->page.frame);
+ ut_ad(!mach_read_from_2(FLST_LEN + ofs + page));
+ ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + page));
+ ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + page));
+ compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
+ mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+ mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+}
+
+/** Initialize a list base node.
+@param[in] block file page
+@param[in,out] base base node
+@param[in,out] mtr mini-transaction */
+void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** Append a file list node to a list.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the node to be added
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Prepend a file list node to a list.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the node to be added
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Remove a file list node.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] cur block to be removed
+@param[in] coffset byte offset of the current record to be removed
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** @return the length of a list */
+inline uint32_t flst_get_len(const flst_base_node_t *base)
+{
+ return mach_read_from_4(base + FLST_LEN);
+}
+
+/** @return a file address */
+inline fil_addr_t flst_read_addr(const byte *faddr)
+{
+ fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE),
+ mach_read_from_2(faddr + FIL_ADDR_BYTE) };
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+ return addr;
+}
+
+/** @return list first node address */
+inline fil_addr_t flst_get_first(const flst_base_node_t *base)
+{
+ return flst_read_addr(base + FLST_FIRST);
+}
+
+/** @return list last node address */
+inline fil_addr_t flst_get_last(const flst_base_node_t *base)
+{
+ return flst_read_addr(base + FLST_LAST);
+}
+
+/** @return list next node address */
+inline fil_addr_t flst_get_next_addr(const flst_node_t* node)
+{
+ return flst_read_addr(node + FLST_NEXT);
+}
+
+/** @return list prev node address */
+inline fil_addr_t flst_get_prev_addr(const flst_node_t *node)
+{
+ return flst_read_addr(node + FLST_PREV);
+}
+
+# ifdef UNIV_DEBUG
+/** Validate a file-based list. */
+void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr);
+# endif
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/gis0geo.h b/storage/innobase/include/gis0geo.h
new file mode 100644
index 00000000..3fd01a3a
--- /dev/null
+++ b/storage/innobase/include/gis0geo.h
@@ -0,0 +1,122 @@
+/*****************************************************************************
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software Foundation,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*****************************************************************************/
+
+/**************************************************//**
+@file gis0geo.h
+The r-tree define from MyISAM
+*******************************************************/
+
+#ifndef _gis0geo_h
+#define _gis0geo_h
+
+#include "my_global.h"
+#include "string.h"
+
+#define SPTYPE HA_KEYTYPE_DOUBLE
+#define SPLEN 8
+
+/* Since the mbr could be a point or a linestring, in this case, area of
+mbr is 0. So, we define this macro for calculating the area increasing
+when we need to enlarge the mbr. */
+#define LINE_MBR_WEIGHTS 0.001
+
+/* Types of "well-known binary representation" (wkb) format. */
+enum wkbType
+{
+ wkbPoint = 1,
+ wkbLineString = 2,
+ wkbPolygon = 3,
+ wkbMultiPoint = 4,
+ wkbMultiLineString = 5,
+ wkbMultiPolygon = 6,
+ wkbGeometryCollection = 7
+};
+
+/* Byte order of "well-known binary representation" (wkb) format. */
+enum wkbByteOrder
+{
+ wkbXDR = 0, /* Big Endian */
+ wkbNDR = 1 /* Little Endian */
+};
+
+/*************************************************************//**
+Calculate minimal bounding rectangle (mbr) of the spatial object
+stored in "well-known binary representation" (wkb) format.
+@return 0 if ok */
+int
+rtree_mbr_from_wkb(
+/*===============*/
+ const uchar* wkb, /*!< in: pointer to wkb. */
+ uint size, /*!< in: size of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr); /*!< in/out: mbr. */
+
+/* Rtree split node structure. */
+struct rtr_split_node_t
+{
+ double square; /* square of the mbr.*/
+ int n_node; /* which group in.*/
+ uchar* key; /* key. */
+ double* coords; /* mbr. */
+};
+
+/*************************************************************//**
+Inline function for reserving coords */
+inline
+static
+double*
+reserve_coords(double **d_buffer, /*!< in/out: buffer. */
+ int n_dim) /*!< in: dimensions. */
+/*===========*/
+{
+ double *coords = *d_buffer;
+ (*d_buffer) += n_dim * 2;
+ return coords;
+}
+
+/*************************************************************//**
+Split rtree nodes.
+Return which group the first rec is in. */
+int
+split_rtree_node(
+/*=============*/
+ rtr_split_node_t* node, /*!< in: split nodes.*/
+ int n_entries, /*!< in: entries number.*/
+ int all_size, /*!< in: total key's size.*/
+ int key_size, /*!< in: key's size.*/
+ int min_size, /*!< in: minimal group size.*/
+ int size1, /*!< in: size of group.*/
+ int size2, /*!< in: initial group sizes */
+ double** d_buffer, /*!< in/out: buffer.*/
+ int n_dim, /*!< in: dimensions. */
+ uchar* first_rec); /*!< in: the first rec. */
+
+/** Compare two minimum bounding rectangles.
+@param mode comparison operator
+ MBR_INTERSECT(a,b) a overlaps b
+ MBR_CONTAIN(a,b) a contains b
+ MBR_DISJOINT(a,b) a disjoint b
+ MBR_WITHIN(a,b) a within b
+ MBR_EQUAL(a,b) All coordinates of MBRs are equal
+ MBR_DATA(a,b) Data reference is the same
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the precidate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a);
+#endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
new file mode 100644
index 00000000..b07261ce
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.h
@@ -0,0 +1,513 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include gis0rtree.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+#ifndef gis0rtree_h
+#define gis0rtree_h
+
+#include "btr0cur.h"
+#include "rem0types.h"
+
+/* Whether MBR 'a' contains 'b' */
+#define MBR_CONTAIN_CMP(a, b) \
+ ((((b)->xmin >= (a)->xmin) && ((b)->xmax <= (a)->xmax) \
+ && ((b)->ymin >= (a)->ymin) && ((b)->ymax <= (a)->ymax)))
+
+/* Whether MBR 'a' equals to 'b' */
+#define MBR_EQUAL_CMP(a, b) \
+ ((((b)->xmin == (a)->xmin) && ((b)->xmax == (a)->xmax)) \
+ && (((b)->ymin == (a)->ymin) && ((b)->ymax == (a)->ymax)))
+
+/* Whether MBR 'a' intersects 'b' */
+#define MBR_INTERSECT_CMP(a, b) \
+ ((((b)->xmin <= (a)->xmax) || ((b)->xmax >= (a)->xmin)) \
+ && (((b)->ymin <= (a)->ymax) || ((b)->ymax >= (a)->ymin)))
+
+/* Whether MBR 'a' and 'b' disjoint */
+#define MBR_DISJOINT_CMP(a, b) (!MBR_INTERSECT_CMP(a, b))
+
+/* Whether MBR 'a' within 'b' */
+#define MBR_WITHIN_CMP(a, b) \
+ ((((b)->xmin <= (a)->xmin) && ((b)->xmax >= (a)->xmax)) \
+ && (((b)->ymin <= (a)->ymin) && ((b)->ymax >= (a)->ymax)))
+
+/* Define it for rtree search mode checking. */
+#define RTREE_SEARCH_MODE(mode) \
+ (((mode) >= PAGE_CUR_CONTAIN) && ((mode <= PAGE_CUR_RTREE_GET_FATHER)))
+
+/* Geometry data header */
+#define GEO_DATA_HEADER_SIZE 4
+
+/** Search for a spatial index leaf page record.
+@param cur cursor
+@param tuple search tuple
+@param latch_mode latching mode
+@param mtr mini-transaction
+@param mode search mode */
+dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr,
+ page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Search for inserting a spatial index leaf page record.
+@param cur cursor
+@param tuple search tuple
+@param latch_mode latching mode
+@param mtr mini-transaction */
+inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr)
+{
+ return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT);
+}
+
+/** Search for a spatial index leaf page record.
+@param pcur cursor
+@param tuple search tuple
+@param mode search mode
+@param mtr mini-transaction */
+dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
+ page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_cur_t *cur, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Builds a Rtree node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+rtr_index_build_node_ptr(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ const rtr_mbr_t* mbr, /*!< in: mbr of lower page */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap); /*!< in: memory heap where pointer
+ created */
+
+/*************************************************************//**
+Splits an R-tree index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+@return inserted record */
+rec_t*
+rtr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
+
+/**************************************************************//**
+Sets the child node mbr in a node pointer. */
+UNIV_INLINE
+void
+rtr_page_cal_mbr(
+/*=============*/
+ const dict_index_t* index, /*!< in: index */
+ const buf_block_t* block, /*!< in: buffer block */
+ rtr_mbr_t* mbr, /*!< out: MBR encapsulates the page */
+ mem_heap_t* heap); /*!< in: heap for the memory
+ allocation */
+/*************************************************************//**
+Find the next matching record. This function will first exhaust
+the copied record listed in the rtr_info->matches vector before
+moving to next page
+@return true if there is next qualified record found, otherwise(if
+exhausted) false */
+bool
+rtr_pcur_move_to_next(
+/*==================*/
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ page_cur_mode_t mode, /*!< in: cursor search mode */
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ ulint cur_level,
+ /*!< in: current level */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/****************************************************************//**
+Searches the right position in rtree for a page cursor. */
+bool
+rtr_cur_search_with_match(
+/*======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ dict_index_t* index, /*!< in: index descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ rtr_info_t* rtr_info);/*!< in/out: search stack */
+
+/****************************************************************//**
+Calculate the area increased for a new record
+@return area increased */
+double
+rtr_rec_cal_increase(
+/*=================*/
+ const dtuple_t* dtuple, /*!< in: data tuple to insert, which
+ cause area increase */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ double* area); /*!< out: increased area */
+
+/****************************************************************//**
+Following the right link to find the proper block for insert.
+@return the proper block.*/
+dberr_t
+rtr_ins_enlarge_mbr(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+push a nonleaf index node to the search path */
+UNIV_INLINE
+void
+rtr_non_leaf_stack_push(
+/*====================*/
+ rtr_node_path_t* path, /*!< in/out: search path */
+ uint32_t pageno, /*!< in: pageno to insert */
+ node_seq_t seq_no, /*!< in: Node sequence num */
+ ulint level, /*!< in: index level */
+ uint32_t child_no, /*!< in: child page no */
+ btr_pcur_t* cursor, /*!< in: position cursor */
+ double mbr_inc); /*!< in: MBR needs to be
+ enlarged */
+
+/**************************************************************//**
+push a nonleaf index node to the search path for insertion */
+void
+rtr_non_leaf_insert_stack_push(
+/*===========================*/
+ dict_index_t* index, /*!< in: index descriptor */
+ rtr_node_path_t* path, /*!< in/out: search path */
+ ulint level, /*!< in: index level */
+ const buf_block_t* block, /*!< in: block of the page */
+ const rec_t* rec, /*!< in: positioned record */
+ double mbr_inc); /*!< in: MBR needs to be
+ enlarged */
+
+#define rtr_get_new_ssn_id(index) (index)->assign_ssn()
+#define rtr_get_current_ssn_id(index) (index)->ssn()
+
+/********************************************************************//**
+Create a RTree search info structure */
+rtr_info_t*
+rtr_create_rtr_info(
+/******************/
+ bool need_prdt, /*!< in: Whether predicate lock is
+ needed */
+ bool init_matches, /*!< in: Whether to initiate the
+ "matches" structure for collecting
+ matched leaf records */
+ btr_cur_t* cursor, /*!< in: tree search cursor */
+ dict_index_t* index); /*!< in: index struct */
+
+/********************************************************************//**
+Update a btr_cur_t with rtr_info */
+void
+rtr_info_update_btr(
+/******************/
+ btr_cur_t* cursor, /*!< in/out: tree cursor */
+ rtr_info_t* rtr_info); /*!< in: rtr_info to set to the
+ cursor */
+
+/********************************************************************//**
+Update a btr_cur_t with rtr_info */
+void
+rtr_init_rtr_info(
+/****************/
+ rtr_info_t* rtr_info, /*!< in: rtr_info to set to the
+ cursor */
+ bool need_prdt, /*!< in: Whether predicate lock is
+ needed */
+ btr_cur_t* cursor, /*!< in: tree search cursor */
+ dict_index_t* index, /*!< in: index structure */
+ bool reinit); /*!< in: Whether this is a reinit */
+
+/**************************************************************//**
+Clean up Rtree cursor */
+void
+rtr_clean_rtr_info(
+/*===============*/
+ rtr_info_t* rtr_info, /*!< in: RTree search info */
+ bool free_all); /*!< in: need to free rtr_info itself */
+
+/****************************************************************//**
+Get the bounding box content from an index record*/
+void
+rtr_get_mbr_from_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: data tuple */
+ const rec_offs* offsets,/*!< in: offsets array */
+ rtr_mbr_t* mbr); /*!< out MBR */
+
+/****************************************************************//**
+Get the bounding box content from a MBR data record */
+void
+rtr_get_mbr_from_tuple(
+/*===================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ rtr_mbr* mbr); /*!< out: mbr to fill */
+
+/* Get the rtree page father.
+@param[in,out] mtr mtr
+@param[in] sea_cur search cursor, contains information
+ about parent nodes in search
+@param[in,out] cursor cursor on node pointer record,
+ its page x-latched
+@return whether the cursor was successfully positioned */
+bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor)
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+
+/************************************************************//**
+Returns the father block to a page. It is assumed that mtr holds
+an X or SX latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+rec_offs*
+rtr_page_get_father_block(
+/*======================*/
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* sea_cur,/*!< in: search cursor, contains information
+ about parent nodes in search */
+ btr_cur_t* cursor);/*!< out: cursor on node pointer record,
+ its page x-latched */
+/**************************************************************//**
+Store the parent path cursor
+@return number of cursor stored */
+ulint
+rtr_store_parent_path(
+/*==================*/
+ const buf_block_t* block, /*!< in: block of the page */
+ btr_cur_t* btr_cur,/*!< in/out: persistent cursor */
+ btr_latch_mode latch_mode,
+ /*!< in: latch_mode */
+ ulint level, /*!< in: index level */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+bool rtr_search(
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert); /*!< in: whether it is insert */
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert); /*!< in: whether insert operation */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_end_no_locks(
+/*================================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ rtr_rec_move_t* rec_move, /*!< in: recording records moved */
+ ulint max_move, /*!< in: num of rec to move */
+ ulint* num_moved, /*!< out: num of rec to move */
+ mtr_t* mtr); /*!< in: mtr */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs till a specified rec from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_start_no_locks(
+/*==================================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ rtr_rec_move_t* rec_move, /*!< in: recording records moved */
+ ulint max_move, /*!< in: num of rec to move */
+ ulint* num_moved, /*!< out: num of rec to move */
+ mtr_t* mtr); /*!< in: mtr */
+
+/****************************************************************//**
+Merge 2 mbrs and update the the mbr that cursor is on. */
+void
+rtr_merge_and_update_mbr(
+/*=====================*/
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ btr_cur_t* cursor2, /*!< in: the other cursor */
+ rec_offs* offsets, /*!< in: rec offsets */
+ rec_offs* offsets2, /*!< in: rec offsets */
+ page_t* child_page, /*!< in: the child page. */
+ mtr_t* mtr); /*!< in: mtr */
+
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+void
+rtr_node_ptr_delete(
+/*================*/
+ btr_cur_t* cursor, /*!< in: search cursor, contains information
+ about parent nodes in search */
+ mtr_t* mtr); /*!< in: mtr */
+
+/****************************************************************//**
+Check two MBRs are identical or need to be merged */
+bool
+rtr_merge_mbr_changed(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor */
+ btr_cur_t* cursor2, /*!< in: the other cursor */
+ rec_offs* offsets, /*!< in: rec offsets */
+ rec_offs* offsets2, /*!< in: rec offsets */
+ rtr_mbr_t* new_mbr); /*!< out: MBR to update */
+
+
+/**************************************************************//**
+Update the mbr field of a spatial index row. */
+void
+rtr_update_mbr_field(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: cursor pointed to rec.*/
+ rec_offs* offsets, /*!< in: offsets on rec. */
+ btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec
+ that should be deleted.
+ this cursor is for btr_compress to
+ delete the merged page's father rec.*/
+ page_t* child_page, /*!< in: child page. */
+ rtr_mbr_t* new_mbr, /*!< in: the new mbr. */
+ rec_t* new_rec, /*!< in: rec to use */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+Check whether a Rtree page is child of a parent page
+@return true if there is child/parent relationship */
+bool
+rtr_check_same_block(
+/*=================*/
+ dict_index_t* index, /*!< in: index tree */
+ btr_cur_t* cur, /*!< in/out: position at the parent entry
+ pointing to the child if successful */
+ buf_block_t* parentb,/*!< in: parent page to check */
+ mem_heap_t* heap); /*!< in: memory heap */
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+rtr_write_mbr(
+/*==========*/
+ byte* data, /*!< out: data */
+ const rtr_mbr_t* mbr); /*!< in: data */
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+rtr_read_mbr(
+/*==========*/
+ const byte* data, /*!< in: data */
+ rtr_mbr_t* mbr); /*!< out: data */
+
+/**************************************************************//**
+Check whether a discarding page is in anyone's search path */
+void
+rtr_check_discard_page(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ buf_block_t* block); /*!< in: block of page to be discarded */
+
+/********************************************************************//**
+Reinitialize a RTree search info */
+UNIV_INLINE
+void
+rtr_info_reinit_in_cursor(
+/************************/
+ btr_cur_t* cursor, /*!< in/out: tree cursor */
+ dict_index_t* index, /*!< in: index struct */
+ bool need_prdt); /*!< in: Whether predicate lock is
+ needed */
+
+/** Estimates the number of rows in a given area.
+@param[in] index index
+@param[in] tuple range tuple containing mbr, may also be empty tuple
+@param[in] mode search mode
+@return estimated number of rows */
+ha_rows
+rtr_estimate_n_rows_in_range(
+ dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode);
+
+#include "gis0rtree.inl"
+#endif /*!< gis0rtree.h */
diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl
new file mode 100644
index 00000000..5101eeb6
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.inl
@@ -0,0 +1,245 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include gis0rtree.h
+R-tree Inline code
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+/**************************************************************//**
+Sets the child node mbr in a node pointer. */
+UNIV_INLINE
+void
+rtr_page_cal_mbr(
+/*=============*/
+ const dict_index_t* index, /*!< in: index */
+ const buf_block_t* block, /*!< in: buffer block */
+ rtr_mbr_t* rtr_mbr,/*!< out: MBR encapsulates the page */
+ mem_heap_t* heap) /*!< in: heap for the memory
+ allocation */
+{
+ page_t* page;
+ rec_t* rec;
+ const byte* field;
+ ulint len;
+ rec_offs* offsets = NULL;
+ double bmin, bmax;
+ double* amin;
+ double* amax;
+ ulint inc = 0;
+ double* mbr;
+
+ rtr_mbr->xmin = DBL_MAX;
+ rtr_mbr->ymin = DBL_MAX;
+ rtr_mbr->xmax = -DBL_MAX;
+ rtr_mbr->ymax = -DBL_MAX;
+
+ mbr = reinterpret_cast<double*>(rtr_mbr);
+
+ page = buf_block_get_frame(block);
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ if (UNIV_UNLIKELY(!rec)) {
+ return;
+ }
+ offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page)
+ ? index->n_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ do {
+ /* The mbr address is in the first field. */
+ field = rec_get_nth_field(rec, offsets, 0, &len);
+
+ ut_ad(len == DATA_MBR_LEN);
+ inc = 0;
+ for (unsigned i = 0; i < SPDIMS; i++) {
+ bmin = mach_double_read(field + inc);
+ bmax = mach_double_read(field + inc + sizeof(double));
+
+ amin = mbr + i * SPDIMS;
+ amax = mbr + i * SPDIMS + 1;
+
+ if (*amin > bmin)
+ *amin = bmin;
+ if (*amax < bmax)
+ *amax = bmax;
+
+ inc += 2 * sizeof(double);
+ }
+
+ rec = page_rec_get_next(rec);
+
+ if (rec == NULL) {
+ break;
+ }
+ } while (!page_rec_is_supremum(rec));
+}
+
+/**************************************************************//**
+push a nonleaf index node to the search path */
+UNIV_INLINE
+void
+rtr_non_leaf_stack_push(
+/*====================*/
+ rtr_node_path_t* path, /*!< in/out: search path */
+ uint32_t pageno, /*!< in: pageno to insert */
+ node_seq_t seq_no, /*!< in: Node sequence num */
+ ulint level, /*!< in: index page level */
+ uint32_t child_no, /*!< in: child page no */
+ btr_pcur_t* cursor, /*!< in: position cursor */
+ double mbr_inc) /*!< in: MBR needs to be
+ enlarged */
+{
+ node_visit_t insert_val;
+
+ insert_val.page_no = pageno;
+ insert_val.seq_no = seq_no;
+ insert_val.level = level;
+ insert_val.child_no = child_no;
+ insert_val.cursor = cursor;
+ insert_val.mbr_inc = mbr_inc;
+
+ path->push_back(insert_val);
+
+#ifdef RTR_SEARCH_DIAGNOSTIC
+ fprintf(stderr, "INNODB_RTR: Push page %d, level %d, seq %d"
+ " to search stack \n",
+ static_cast<int>(pageno), static_cast<int>(level),
+ static_cast<int>(seq_no));
+#endif /* RTR_SEARCH_DIAGNOSTIC */
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+rtr_write_mbr(
+/*==========*/
+ byte* data, /*!< out: data */
+ const rtr_mbr_t* mbr) /*!< in: data */
+{
+ const double* my_mbr = reinterpret_cast<const double*>(mbr);
+
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ mach_double_write(data + i * sizeof(double), my_mbr[i]);
+ }
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+rtr_read_mbr(
+/*==========*/
+ const byte* data, /*!< in: data */
+ rtr_mbr_t* mbr) /*!< out: MBR */
+{
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ (reinterpret_cast<double*>(mbr))[i] = mach_double_read(
+ data
+ + i * sizeof(double));
+ }
+}
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer to R-Tree cursor component in the parent path,
+NULL if parent path is empty or index is larger than num of items contained */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert) /*!< in: whether it is insert */
+{
+ ulint num;
+ ulint tree_height = btr_cur->tree_height;
+ node_visit_t* found_node = NULL;
+
+ if (level >= tree_height) {
+ return(NULL);
+ }
+
+ mysql_mutex_lock(&btr_cur->rtr_info->rtr_path_mutex);
+
+ num = btr_cur->rtr_info->parent_path->size();
+
+ if (!num) {
+ mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
+ return(NULL);
+ }
+
+ if (is_insert) {
+ ulint idx = tree_height - level - 1;
+ ut_ad(idx < num);
+
+ found_node = &(*btr_cur->rtr_info->parent_path)[idx];
+ } else {
+ node_visit_t* node;
+
+ while (num > 0) {
+ node = &(*btr_cur->rtr_info->parent_path)[num - 1];
+
+ if (node->level == level) {
+ found_node = node;
+ break;
+ }
+ num--;
+ }
+ }
+
+ mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
+
+ return(found_node);
+}
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert) /*!< in: whether insert operation */
+{
+ node_visit_t* found_node = rtr_get_parent_node(
+ btr_cur, level, is_insert);
+
+ return((found_node) ? found_node->cursor : NULL);
+}
+
+/********************************************************************//**
+Reinitialize a R-Tree search info in btr_cur_t */
+UNIV_INLINE
+void
+rtr_info_reinit_in_cursor(
+/************************/
+ btr_cur_t* cursor, /*!< in/out: tree cursor */
+ dict_index_t* index, /*!< in: index struct */
+ bool need_prdt) /*!< in: Whether predicate lock is
+ needed */
+{
+ rtr_clean_rtr_info(cursor->rtr_info, false);
+ rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true);
+}
diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h
new file mode 100644
index 00000000..d6a4ef67
--- /dev/null
+++ b/storage/innobase/include/gis0type.h
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include gis0type.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang
+***********************************************************************/
+
+#ifndef gis0type_h
+#define gis0type_h
+
+#include "buf0buf.h"
+#include "data0type.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "ut0vec.h"
+#include "gis0geo.h"
+
+#include <vector>
+#include <forward_list>
+
+/** Node Sequence Number. Only updated when page splits */
+typedef uint32_t node_seq_t;
+
+/* RTree internal non-leaf Nodes to be searched, from root to leaf */
+struct node_visit_t {
+ uint32_t page_no; /*!< the page number */
+ node_seq_t seq_no; /*!< the SSN (split sequence number */
+ ulint level; /*!< the page's index level */
+ uint32_t child_no; /*!< child page num if for parent
+ recording */
+ btr_pcur_t* cursor; /*!< cursor structure if we positioned
+ FIXME: there is no need to use whole
+ btr_pcur_t, just the position related
+ members */
+ double mbr_inc; /*!< whether this node needs to be
+ enlarged for insertion */
+};
+
+typedef std::vector<node_visit_t, ut_allocator<node_visit_t> > rtr_node_path_t;
+
+typedef struct rtr_rec {
+ rec_t* r_rec; /*!< matched record */
+ bool locked; /*!< whether the record locked */
+} rtr_rec_t;
+
+typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> > rtr_rec_vector;
+
+/* Structure for matched records on the leaf page */
+typedef struct matched_rec {
+ byte* bufp; /*!< aligned buffer point */
+ byte rec_buf[UNIV_PAGE_SIZE_MAX * 2];
+ /*!< buffer used to copy matching rec */
+ buf_block_t block; /*!< the shadow buffer block */
+ ulint used; /*!< memory used */
+ rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */
+ mysql_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs
+ vector */
+ bool valid; /*!< whether result in matched_recs
+ or this search is valid (page not
+ dropped) */
+ bool locked; /*!< whether these recs locked */
+} matched_rec_t;
+
+/* In memory representation of a minimum bounding rectangle */
+typedef struct rtr_mbr {
+ double xmin; /*!< minimum on x */
+ double xmax; /*!< maximum on x */
+ double ymin; /*!< minimum on y */
+ double ymax; /*!< maximum on y */
+} rtr_mbr_t;
+
+/* Maximum index level for R-Tree, this is consistent with BTR_MAX_LEVELS */
+#define RTR_MAX_LEVELS 100
+
+/* Number of pages we latch at leaf level when there is possible Tree
+modification (split, shrink), we always latch left, current
+and right pages */
+#define RTR_LEAF_LATCH_NUM 3
+
+/** Vectors holding the matching internal pages/nodes and leaf records */
+typedef struct rtr_info{
+ rtr_node_path_t*path; /*!< vector holding matching pages */
+ rtr_node_path_t*parent_path;
+ /*!< vector holding parent pages during
+ search */
+ matched_rec_t* matches;/*!< struct holding matching leaf records */
+ mysql_mutex_t rtr_path_mutex;
+ /*!< mutex protect the "path" vector */
+ rtr_mbr_t mbr; /*!< the search MBR */
+ que_thr_t* thr; /*!< the search thread */
+ mem_heap_t* heap; /*!< memory heap */
+ btr_cur_t* cursor; /*!< cursor used for search */
+ dict_index_t* index; /*!< index it is searching */
+ bool need_prdt_lock;
+ /*!< whether we will need predicate lock
+ the tree */
+ bool need_page_lock;
+ /*!< whether we will need predicate page lock
+ the tree */
+ bool allocated;/*!< whether this structure is allocate or
+ on stack */
+ bool mbr_adj;/*!< whether mbr will need to be enlarged
+ for an insertion operation */
+ bool fd_del; /*!< found deleted row */
+ const dtuple_t* search_tuple;
+ /*!< search tuple being used */
+ page_cur_mode_t search_mode;
+ /*!< current search mode */
+} rtr_info_t;
+
+/* Tracking structure for all ongoing search for an index */
+struct rtr_info_track_t {
+ /** Active search info */
+ std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active;
+ mysql_mutex_t rtr_active_mutex;
+ /*!< mutex to protect
+ rtr_active */
+};
+
+/* This is to record the record movement between pages. Used for corresponding
+lock movement */
+typedef struct rtr_rec_move {
+ rec_t* old_rec; /*!< record being moved in old page */
+ rec_t* new_rec; /*!< new record location */
+ bool moved; /*!< whether lock are moved too */
+} rtr_rec_move_t;
+#endif /*!< gis0rtree.h */
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
new file mode 100644
index 00000000..5aaa559b
--- /dev/null
+++ b/storage/innobase/include/ha0ha.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0ha.h
+The hash table interface for the adaptive hash index
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "hash0hash.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "rem0types.h"
+
+#ifdef BTR_CUR_HASH_ADAPT
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: folded value of the searched data */
+
+/** The hash table external chain node */
+struct ha_node_t {
+ ulint fold; /*!< fold value for the data */
+ ha_node_t* next; /*!< next chain node or NULL if none */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block; /*!< buffer block containing the data, or NULL */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data; /*!< pointer to the data */
+};
+
+#include "ha0ha.inl"
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#endif
diff --git a/storage/innobase/include/ha0ha.inl b/storage/innobase/include/ha0ha.inl
new file mode 100644
index 00000000..0b256257
--- /dev/null
+++ b/storage/innobase/include/ha0ha.inl
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ha0ha.ic
+The hash table interface for the adaptive hash index
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef BTR_CUR_HASH_ADAPT
+#include "btr0types.h"
+
+/******************************************************************//**
+Gets a hash node data.
+@return pointer to the data */
+UNIV_INLINE
+const rec_t*
+ha_node_get_data(
+/*=============*/
+ const ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->data);
+}
+
+/******************************************************************//**
+Sets hash node data. */
+UNIV_INLINE
+void
+ha_node_set_data_func(
+/*==================*/
+ ha_node_t* node, /*!< in: hash chain node */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data) /*!< in: pointer to the data */
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data = data;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/******************************************************************//**
+Gets the next node in a hash chain.
+@return next node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(
+/*==============*/
+ const ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->next);
+}
+
+/******************************************************************//**
+Gets the first node in a hash chain.
+@return first node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold value determining the chain */
+{
+ return static_cast<ha_node_t*>(table->array[table->calc_hash(fold)].node);
+}
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: folded value of the searched data */
+{
+ ut_ad(btr_search_enabled);
+
+ for (const ha_node_t* node = ha_chain_get_first(table, fold);
+ node != NULL;
+ node = ha_chain_get_next(node)) {
+
+ if (node->fold == fold) {
+
+ return(node->data);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data.
+@return pointer to the hash table node, NULL if not found in the table */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ const rec_t* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(btr_search_enabled);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->data == data) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h
new file mode 100644
index 00000000..fdf50a2e
--- /dev/null
+++ b/storage/innobase/include/ha0storage.h
@@ -0,0 +1,137 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.h
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef ha0storage_h
+#define ha0storage_h
+
+#include "univ.i"
+
+/** This value is used by default by ha_storage_create(). More memory
+is allocated later when/if it is needed. */
+#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024
+
+/** This value is used by default by ha_storage_create(). It is a
+constant per ha_storage's lifetime. */
+#define HA_STORAGE_DEFAULT_HASH_CELLS 4096
+
+/** Hash storage */
+struct ha_storage_t;
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells); /*!< in: initial number of cells
+ in the hash table */
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of storage is going to
+become more than "memlim" then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit".
+@return pointer to the copy */
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim); /*!< in: memory limit to obey */
+
+/*******************************************************************//**
+Same as ha_storage_put_memlim() but without memory limit.
+@param storage in/out: hash storage
+@param data in: data to store
+@param data_len in: data length
+@return pointer to the copy of the string */
+#define ha_storage_put(storage, data, data_len) \
+ ha_storage_put_memlim((storage), (data), (data_len), 0)
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy. If the
+same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@return pointer to the copy of the string */
+#define ha_storage_put_str(storage, str) \
+ ((const char*) ha_storage_put((storage), (str), strlen(str) + 1))
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy obeying
+a memory limit.
+If the same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@param memlim in: memory limit to obey
+@return pointer to the copy of the string */
+#define ha_storage_put_str_memlim(storage, str, memlim) \
+ ((const char*) ha_storage_put_memlim((storage), (str), \
+ strlen(str) + 1, (memlim)))
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage); /*!< in/out: hash storage */
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage); /*!< in, own: hash storage */
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage); /*!< in: hash storage */
+
+#include "ha0storage.inl"
+
+#endif /* ha0storage_h */
diff --git a/storage/innobase/include/ha0storage.inl b/storage/innobase/include/ha0storage.inl
new file mode 100644
index 00000000..df9679cf
--- /dev/null
+++ b/storage/innobase/include/ha0storage.inl
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.ic
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+/** Hash storage for strings */
+struct ha_storage_t {
+ mem_heap_t* heap; /*!< memory heap from which memory is
+ allocated */
+ hash_table_t hash; /*!< hash table used to avoid
+ duplicates */
+};
+
+/** Objects of this type are stored in ha_storage_t */
+struct ha_storage_node_t {
+ ulint data_len;/*!< length of the data */
+ const void* data; /*!< pointer to data */
+ ha_storage_node_t* next; /*!< next node in hash chain */
+};
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells) /*!< in: initial number of cells
+ in the hash table */
+{
+ ha_storage_t* storage;
+ mem_heap_t* heap;
+
+ if (initial_heap_bytes == 0) {
+
+ initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES;
+ }
+
+ if (initial_hash_cells == 0) {
+
+ initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS;
+ }
+
+ /* we put "storage" within "storage->heap" */
+
+ heap = mem_heap_create(sizeof(ha_storage_t)
+ + initial_heap_bytes);
+
+ storage = (ha_storage_t*) mem_heap_alloc(heap,
+ sizeof(ha_storage_t));
+
+ storage->heap = heap;
+ storage->hash.create(initial_hash_cells);
+
+ return(storage);
+}
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage) /*!< in/out: hash storage */
+{
+ ha_storage_t temp_storage;
+
+ temp_storage.heap = (*storage)->heap;
+ temp_storage.hash = (*storage)->hash;
+
+ temp_storage.hash.clear();
+ mem_heap_empty(temp_storage.heap);
+
+ *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap,
+ sizeof(ha_storage_t));
+
+ (*storage)->heap = temp_storage.heap;
+ (*storage)->hash = temp_storage.hash;
+}
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage) /*!< in, own: hash storage */
+{
+ storage->hash.free();
+ mem_heap_free(storage->heap);
+}
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage) /*!< in: hash storage */
+{
+ ulint ret;
+
+ ret = mem_heap_get_size(storage->heap);
+
+ /* this assumes hash->heap and hash->heaps are NULL */
+ ret += sizeof(hash_table_t);
+ ret += sizeof(hash_cell_t) * storage->hash.n_cells;
+
+ return(ret);
+}
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
new file mode 100644
index 00000000..d5239ec3
--- /dev/null
+++ b/storage/innobase/include/ha_prototypes.h
@@ -0,0 +1,476 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ha_prototypes.h
+Prototypes for global functions in ha_innodb.cc that are called by
+InnoDB C code.
+
+NOTE: This header is intended to insulate InnoDB from SQL names and functions.
+Do not include any headers other than univ.i into this unless they are very
+simple headers.
+************************************************************************/
+
+#ifndef HA_INNODB_PROTOTYPES_H
+#define HA_INNODB_PROTOTYPES_H
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* Forward declarations */
+class THD;
+class Field;
+
+// JAN: TODO missing features:
+#undef MYSQL_FT_INIT_EXT
+#undef MYSQL_PFS
+#undef MYSQL_STORE_FTS_DOC_ID
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name); /*!< in: concatenation of
+ database name, path separator,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+
+/** Quote a standard SQL identifier like tablespace, index or column name.
+@param[in] file output stream
+@param[in] trx InnoDB transaction, or NULL
+@param[in] id identifier to quote */
+void
+innobase_quote_identifier(
+ FILE* file,
+ trx_t* trx,
+ const char* id);
+
+/** Quote an standard SQL identifier like tablespace, index or column name.
+Return the string as an std:string object.
+@param[in] trx InnoDB transaction, or NULL
+@param[in] id identifier to quote
+@return a std::string with id properly quoted. */
+std::string
+innobase_quote_identifier(
+ trx_t* trx,
+ const char* id);
+
+/*****************************************************************//**
+Convert a table name to the MySQL system_charset_info (UTF-8).
+@return pointer to the end of buf */
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: table name to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd); /*!< in: MySQL connection thread, or NULL */
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ THD* thd); /*!< in: thread handle */
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ THD* thd, /*!< in: pointer to a MySQL THD object */
+ uint max_query_len); /*!< in: max query length to print, or 0 to
+ use the default max length */
+
+/** Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type';
+at least ENUM and SET, and unsigned integer types are 'unsigned types'
+@param[in] f MySQL Field
+@return DATA_BINARY, DATA_VARCHAR, ... */
+uint8_t
+get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field);
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >1 if a>b */
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b); /*!< in: second string to compare */
+
+/** Strip dir name from a full path name and return only the file name
+@param[in] path_name full path name
+@return file name or "null" if no file name */
+const char*
+innobase_basename(
+ const char* path_name);
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ CHARSET_INFO* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes; should
+ be at least 5 * strlen(to) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+void
+innobase_convert_from_id(
+/*=====================*/
+ CHARSET_INFO* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes;
+ should be at least 3 * strlen(to) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+void
+innobase_casedn_str(
+/*================*/
+ char* a); /*!< in/out: string to put in lower case */
+
+#ifdef WITH_WSREP
+ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
+ unsigned char* str, ulint str_length,
+ ulint buf_length);
+#endif /* WITH_WSREP */
+
+extern "C" struct charset_info_st *thd_charset(THD *thd);
+
+/** Get high resolution timestamp for the current query start time.
+The timestamp is not anchored to any specific point in time,
+but can be used for comparison.
+@param thd user thread
+@retval timestamp in microseconds precision
+*/
+extern "C" unsigned long long thd_start_utime(const MYSQL_THD thd);
+
+
+/** Determines the current SQL statement.
+Thread unsafe, can only be called from the thread owning the THD.
+@param[in] thd MySQL thread handle
+@param[out] length Length of the SQL statement
+@return SQL statement string */
+const char*
+innobase_get_stmt_unsafe(
+ THD* thd,
+ size_t* length);
+
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str); /*!< in: character string */
+
+/** Get status of innodb_tmpdir.
+@param[in] thd thread handle, or NULL to query
+ the global innodb_tmpdir.
+@retval NULL if innodb_tmpdir="" */
+const char *thd_innodb_tmpdir(THD *thd);
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+uint&
+thd_lock_wait_timeout(
+/*==================*/
+ THD* thd); /*!< in: thread handle, or NULL to query
+ the global innodb_lock_wait_timeout */
+
+/******************************************************************//**
+compare two character string case insensitively according to their charset. */
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+bool
+thd_trx_is_read_only(
+/*=================*/
+ THD* thd); /*!< in/out: thread handle */
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+ THD* thd); /*!< in: thread handle, or NULL */
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(), convert a table name
+to the MySQL system_charset_info (UTF-8) and quote it if needed.
+@return pointer to the end of buf */
+void
+innobase_format_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* name); /*!< in: table name to format */
+
+/** Corresponds to Sql_condition:enum_warning_level. */
+enum ib_log_level_t {
+ IB_LOG_LEVEL_INFO,
+ IB_LOG_LEVEL_WARN,
+ IB_LOG_LEVEL_ERROR,
+ IB_LOG_LEVEL_FATAL
+};
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+void
+ib_errf(
+/*====*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ const char* format, /*!< printf format */
+ ...) /*!< Args */
+ MY_ATTRIBUTE((format(printf, 4, 5)));
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+void
+ib_senderrf(
+/*========*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ ...); /*!< Args */
+
+extern const char* TROUBLESHOOTING_MSG;
+extern const char* TROUBLESHOOT_DATADICT_MSG;
+extern const char* BUG_REPORT_MSG;
+extern const char* FORCE_RECOVERY_MSG;
+extern const char* OPERATING_SYSTEM_ERROR_MSG;
+extern const char* FOREIGN_KEY_CONSTRAINTS_MSG;
+extern const char* SET_TRANSACTION_MSG;
+extern const char* INNODB_PARAMETERS_MSG;
+
+/******************************************************************//**
+Returns the NUL terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+const char*
+server_get_hostname();
+/*=================*/
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
+@return the next value */
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+ MY_ATTRIBUTE((pure, warn_unused_result));
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset. */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len, /* in: length of 'to', in bytes */
+ uint* errors); /* out: error return */
+/**********************************************************************
+Check if the length of the identifier exceeds the maximum allowed.
+The input to this function is an identifier in charset my_charset_filename.
+return true when length of identifier is too long. */
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+ const char* id); /* in: identifier to check. it must belong
+ to charset my_charset_filename */
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset. */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len, /* in: length of 'to', in bytes */
+ uint* errors); /* out: error return */
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset. */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len); /* in: length of 'to', in bytes */
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+void
+ib_push_warning(
+ trx_t* trx, /*!< in: trx */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *format,/*!< in: warning message */
+ ...);
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+void
+ib_push_warning(
+ void* ithd, /*!< in: thd */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *format,/*!< in: warning message */
+ ...);
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+void
+ib_foreign_warn(
+ trx_t* trx, /*!< in: trx */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *table_name,
+ const char *format,/*!< in: warning message */
+ ...);
+
+/*****************************************************************//**
+Normalizes a table name string. A normalized name consists of the
+database name catenated to '/' and table name. An example:
+test/mytable. On Windows normalization puts both the database name and the
+table name always to lower case if "set_lower_case" is set to TRUE. */
+void
+normalize_table_name_c_low(
+/*=======================*/
+ char* norm_name, /*!< out: normalized name as a
+ null-terminated string */
+ const char* name, /*!< in: table name string */
+ bool set_lower_case); /*!< in: true if we want to set
+ name to lower case */
+
+/** Create a MYSQL_THD for a background thread and mark it as such.
+@param name thread info for SHOW PROCESSLIST
+@return new MYSQL_THD */
+MYSQL_THD innobase_create_background_thd(const char* name);
+
+/** Destroy a THD object associated with a background task.
+@param[in] thd MYSQL_THD to destroy */
+void destroy_background_thd(MYSQL_THD thd);
+
+/** Close opened tables, free memory, delete items for a MYSQL_THD.
+@param[in] thd MYSQL_THD to reset */
+void
+innobase_reset_background_thd(MYSQL_THD);
+
+#ifdef WITH_WSREP
+/** Append table-level exclusive key.
+@param thd MySQL thread handle
+@param table table
+@retval false on success
+@retval true on failure */
+struct dict_table_t;
+bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table);
+#endif /* WITH_WSREP */
+
+#endif /* !UNIV_INNOCHECKSUM */
+#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h
new file mode 100644
index 00000000..add983a0
--- /dev/null
+++ b/storage/innobase/include/handler0alter.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/handler0alter.h
+Smart ALTER TABLE
+*******************************************************/
+
+#include "rem0types.h"
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+void
+innobase_rec_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(
+ rec, index, ...) */
+ MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. */
+void
+innobase_fields_to_mysql(
+/*=====================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_index_t* index, /*!< in: InnoDB index */
+ const dfield_t* fields) /*!< in: InnoDB index fields */
+ MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. */
+void
+innobase_row_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_table_t* itab, /*!< in: InnoDB table */
+ const dtuple_t* row) /*!< in: InnoDB row */
+ MY_ATTRIBUTE((nonnull));
+
+/** Generate the next autoinc based on a snapshot of the session
+auto_increment_increment and auto_increment_offset variables. */
+struct ib_sequence_t {
+
+ /**
+ @param thd the session
+ @param start_value the lower bound
+ @param max_value the upper bound (inclusive) */
+ ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value);
+
+ /** Postfix increment
+ @return the value to insert */
+ ulonglong operator++(int) UNIV_NOTHROW;
+
+ /** Check if the autoinc "sequence" is exhausted.
+ @return true if the sequence is exhausted */
+ bool eof() const UNIV_NOTHROW
+ {
+ return(m_eof);
+ }
+
+ /**
+ @return the next value in the sequence */
+ ulonglong last() const UNIV_NOTHROW
+ {
+ ut_ad(m_next_value > 0);
+
+ return(m_next_value);
+ }
+
+ /** @return maximum column value
+ @retval 0 if not adding AUTO_INCREMENT column */
+ ulonglong max_value() const { return m_max_value; }
+
+private:
+ /** Maximum value if adding an AUTO_INCREMENT column, else 0 */
+ ulonglong m_max_value;
+
+ /** Value of auto_increment_increment */
+ ulong m_increment;
+
+ /** Value of auto_increment_offset */
+ ulong m_offset;
+
+ /** Next value in the sequence */
+ ulonglong m_next_value;
+
+ /** true if no more values left in the sequence */
+ bool m_eof;
+};
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
new file mode 100644
index 00000000..867ad9e0
--- /dev/null
+++ b/storage/innobase/include/hash0hash.h
@@ -0,0 +1,190 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "ut0rnd.h"
+#include "ut0new.h"
+
+struct hash_table_t;
+struct hash_cell_t
+{
+ /** singly-linked, nullptr terminated list of hash buckets */
+ void *node;
+
+ /** Append an element.
+ @tparam T type of the element
+ @param insert the being-inserted element
+ @param next the next-element pointer in T */
+ template<typename T>
+ void append(T &insert, T *T::*next)
+ {
+ void **after;
+ for (after= &node; *after;
+ after= reinterpret_cast<void**>(&(static_cast<T*>(*after)->*next)));
+ insert.*next= nullptr;
+ *after= &insert;
+ }
+};
+
+/*******************************************************************//**
+Inserts a struct to a hash table. */
+
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ (DATA)->NAME = NULL;\
+\
+ cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
+\
+ if (cell3333->node == NULL) {\
+ cell3333->node = DATA;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != NULL) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ }\
+\
+ struct3333->NAME = DATA;\
+ }\
+} while (0)
+
+#ifdef UNIV_HASH_DEBUG
+# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
+# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
+#else
+# define HASH_ASSERT_VALID(DATA) do {} while (0)
+# define HASH_INVALIDATE(DATA, NAME) do {} while (0)
+#endif
+
+/*******************************************************************//**
+Deletes a struct from a hash table. */
+
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
+\
+ if (cell3333->node == DATA) {\
+ HASH_ASSERT_VALID(DATA->NAME);\
+ cell3333->node = DATA->NAME;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != DATA) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ ut_a(struct3333);\
+ }\
+\
+ struct3333->NAME = DATA->NAME;\
+ }\
+ HASH_INVALIDATE(DATA, NAME);\
+} while (0)
+
+/*******************************************************************//**
+Gets the first struct in a hash chain, NULL if none. */
+
+#define HASH_GET_FIRST(TABLE, HASH_VAL) (TABLE)->array[HASH_VAL].node
+
+/*******************************************************************//**
+Gets the next struct in a hash chain, NULL if none. */
+
+#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME)
+
+/********************************************************************//**
+Looks for a struct in a hash table. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
+{\
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \
+ HASH_ASSERT_VALID(DATA);\
+\
+ while ((DATA) != NULL) {\
+ ASSERTION;\
+ if (TEST) {\
+ break;\
+ } else {\
+ HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\
+ }\
+ }\
+}
+
+/********************************************************************//**
+Looks for an item in all hash buckets. */
+#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \
+do { \
+ ulint i3333; \
+ \
+ for (i3333 = (TABLE)->n_cells; i3333--; ) { \
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \
+ \
+ while ((DATA) != NULL) { \
+ HASH_ASSERT_VALID(DATA); \
+ ASSERTION; \
+ \
+ if (TEST) { \
+ break; \
+ } \
+ \
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \
+ } \
+ \
+ if ((DATA) != NULL) { \
+ break; \
+ } \
+ } \
+} while (0)
+
+/** Hash table with singly-linked overflow lists */
+struct hash_table_t
+{
+ /** number of elements in array (a prime number) */
+ ulint n_cells;
+ /** the hash array */
+ hash_cell_t *array;
+
+ /** Create the hash table.
+ @param n the lower bound of n_cells */
+ void create(ulint n)
+ {
+ n_cells= ut_find_prime(n);
+ array= static_cast<hash_cell_t*>(ut_zalloc_nokey(n_cells * sizeof *array));
+ }
+
+ /** Clear the hash table. */
+ void clear() { memset(array, 0, n_cells * sizeof *array); }
+
+ /** Free the hash table. */
+ void free() { ut_free(array); array= nullptr; }
+
+ ulint calc_hash(ulint fold) const { return ut_hash_ulint(fold, n_cells); }
+};
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000..c246b2ef
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,436 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE (25)
+
+/* Possible operations buffered in the insert/whatever buffer. See
+ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
+typedef enum {
+ IBUF_OP_INSERT = 0,
+ IBUF_OP_DELETE_MARK = 1,
+ IBUF_OP_DELETE = 2,
+
+ /* Number of different operation types. */
+ IBUF_OP_COUNT = 3
+} ibuf_op_t;
+
+/** Combinations of operations that can be buffered.
+@see innodb_change_buffering_names */
+enum ibuf_use_t {
+ IBUF_USE_NONE = 0,
+ IBUF_USE_INSERT, /* insert */
+ IBUF_USE_DELETE_MARK, /* delete */
+ IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */
+ IBUF_USE_DELETE, /* delete+purge */
+ IBUF_USE_ALL /* insert+delete+purge */
+};
+
+/** Operations that can currently be buffered. */
+extern ulong innodb_change_buffering;
+
+/** Insert buffer struct */
+struct ibuf_t{
+ Atomic_relaxed<ulint> size; /*!< current size of the ibuf index
+ tree, in pages */
+ Atomic_relaxed<ulint> max_size; /*!< recommended maximum size of the
+ ibuf index tree, in pages */
+ ulint seg_size; /*!< allocated pages of the file
+ segment containing ibuf header and
+ tree */
+ bool empty; /*!< Protected by the page
+ latch of the root page of the
+ insert buffer tree
+ (FSP_IBUF_TREE_ROOT_PAGE_NO). true
+ if and only if the insert
+ buffer tree is empty. */
+ ulint free_list_len; /*!< length of the free list */
+ ulint height; /*!< tree height */
+ dict_index_t* index; /*!< insert buffer index */
+
+ /** number of pages merged */
+ Atomic_counter<ulint> n_merges;
+ Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ merged to index pages */
+ Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ discarded without merging due to the
+ tablespace being deleted or the
+ index being dropped */
+};
+
+/** The insert buffer control structure */
+extern ibuf_t ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no). When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation. This
+is called the insert buffer merge. */
+
+/* The insert buffer merge must always succeed. To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations. Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments. The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page. It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space. It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+void
+ibuf_max_size_update(
+/*=================*/
+ ulint new_val); /*!< in: new value in terms of
+ percentage of the buffer pool size */
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block); /*!< in: index page; free bits are set to 0
+ if the index is a non-clustered
+ non-unique, and page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase);/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique); /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+ const mtr_t* mtr) /*!< in: mini-transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Checks if a page address is an ibuf bitmap page (level 3 page) address.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return TRUE if a bitmap page */
+inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size)
+{
+ ut_ad(ut_is_2pow(zip_size));
+ ulint size = zip_size ? zip_size : srv_page_size;
+ return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET;
+}
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] x_latch FALSE if relaxed check (avoid latching the
+bitmap page)
+@param[in,out] mtr mtr which will contain an x-latch to the
+bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
+in which case a new transaction is created.
+@return true if level 2 or level 3 page */
+bool
+ibuf_page_low(
+ const page_id_t page_id,
+ ulint zip_size,
+#ifdef UNIV_DEBUG
+ bool x_latch,
+#endif /* UNIV_DEBUG */
+ mtr_t* mtr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef UNIV_DEBUG
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in] page_id tablespace/page identifier
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(page_id, zip_size, mtr) \
+ ibuf_page_low(page_id, zip_size, true, mtr)
+
+#else /* UNIV_DEBUG */
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in] page_id tablespace/page identifier
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(page_id, zip_size, mtr) \
+ ibuf_page_low(page_id, zip_size, mtr)
+
+#endif /* UNIV_DEBUG */
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+
+/** Buffer an operation in the change buffer, instead of applying it
+directly to the file page, if this is possible. Does not do it if the index
+is clustered or unique.
+@param[in] op operation type
+@param[in] entry index entry to insert
+@param[in,out] index index where to insert
+@param[in] page_id page id where to insert
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] thr query thread
+@return true if success */
+bool
+ibuf_insert(
+ ibuf_op_t op,
+ const dtuple_t* entry,
+ dict_index_t* index,
+ const page_id_t page_id,
+ ulint zip_size,
+ que_thr_t* thr);
+
+/** Check whether buffered changes exist for a page.
+@param[in] id page identifier
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return whether buffered changes exist */
+bool ibuf_page_exists(const page_id_t id, ulint zip_size);
+
+/** When an index page is read from a disk to the buffer pool, this function
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped.
+@param block X-latched page to try to apply changes to, or NULL to discard
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return error code */
+dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block,
+ const page_id_t page_id,
+ ulint zip_size);
+
+/** Delete all change buffer entries for a tablespace,
+in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
+@param[in] space missing or to-be-discarded tablespace */
+void ibuf_delete_for_discarded_space(uint32_t space);
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read
+@retval 0 if ibuf.empty */
+ulint ibuf_contract();
+
+/** Contracts insert buffer trees by reading pages referring to space_id
+to the buffer pool.
+@returns number of pages merged.*/
+ulint
+ibuf_merge_space(
+/*=============*/
+ ulint space); /*!< in: space id */
+
+/******************************************************************//**
+Looks if the insert buffer is empty.
+@return true if empty */
+bool
+ibuf_is_empty(void);
+/*===============*/
+/******************************************************************//**
+Prints info of ibuf. */
+void
+ibuf_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+/********************************************************************
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+ const rec_t* rec); /*!< in: ibuf record */
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+void
+ibuf_close(void);
+/*============*/
+
+/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
+@param[in] trx transaction
+@param[in,out] space tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update free bits and buffered bits for bulk loaded page.
+@param block secondary index leaf page
+@param mtr mini-transaction
+@param reset whether the page is full */
+void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset);
+
+#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER PAGE_DATA
+#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID static_cast<ulint>(0)
+
+#include "ibuf0ibuf.inl"
+
+#endif
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
new file mode 100644
index 00000000..003bf22a
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.inl
@@ -0,0 +1,282 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.ic
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "buf0lru.h"
+
+/** An index page must contain at least srv_page_size /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page. If there is this much of free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
+
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+{
+ mtr_start(mtr);
+ mtr->enter_ibuf();
+
+ if (high_level_read_only || srv_read_only_mode) {
+ mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);
+ }
+
+}
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->is_inside_ibuf());
+ ut_d(mtr->exit_ibuf());
+
+ mtr_commit(mtr);
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val); /*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique) /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+{
+ if (index->type & (DICT_CLUSTERED | DICT_IBUF | DICT_SPATIAL) ||
+ !innodb_change_buffering || !ibuf.max_size)
+ return false;
+ if (!ignore_sec_unique && index->is_unique())
+ return false;
+ if (index->table->quiesce != QUIESCE_NONE)
+ return false;
+ for (unsigned i= 0; i < index->n_fields; i++)
+ if (index->fields[i].descending)
+ return false;
+ return true;
+}
+
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+ const mtr_t* mtr) /*!< in: mini-transaction */
+{
+ return(mtr->is_inside_ibuf());
+}
+
+/** Translates the free space on a page to a value in the ibuf bitmap.
+@param[in] page_size page size in bytes
+@param[in] max_ins_size maximum insert size after reorganize for
+the page
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+ ulint page_size,
+ ulint max_ins_size)
+{
+ ulint n;
+ ut_ad(ut_is_2pow(page_size));
+ ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+ n = max_ins_size / (page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+ if (n == 3) {
+ n = 2;
+ }
+
+ if (n > 3) {
+ n = 3;
+ }
+
+ return(n);
+}
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ulint max_ins_size;
+ const page_zip_des_t* page_zip;
+ lint zip_max_ins;
+
+ ut_ad(block->page.zip.data);
+
+ /* Consider the maximum insert size on the uncompressed page
+ without reorganizing the page. We must not assume anything
+ about the compression ratio. If zip_max_ins > max_ins_size and
+ there is 1/4 garbage on the page, recompression after the
+ reorganize could fail, in theory. So, let us guarantee that
+ merging a buffered insert to a compressed page will always
+ succeed without reorganizing or recompressing the page, just
+ by using the page modification log. */
+ max_ins_size = page_get_max_insert_size(
+ buf_block_get_frame(block), 1);
+
+ page_zip = buf_block_get_page_zip(block);
+ zip_max_ins = page_zip_max_ins_size(page_zip,
+ FALSE/* not clustered */);
+
+ if (zip_max_ins < 0) {
+ return(0);
+ } else if (max_ins_size > (ulint) zip_max_ins) {
+ max_ins_size = (ulint) zip_max_ins;
+ }
+
+ return(ibuf_index_page_calc_free_bits(block->physical_size(),
+ max_ins_size));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(
+/*======================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ if (!block->page.zip.data) {
+ ulint max_ins_size;
+
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ buf_block_get_frame(block), 1);
+
+ return(ibuf_index_page_calc_free_bits(
+ block->physical_size(), max_ins_size));
+ } else {
+ return(ibuf_index_page_calc_free_zip(block));
+ }
+}
+
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase)/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+{
+ ulint before;
+ ulint after;
+
+ ut_ad(buf_block_get_page_zip(block) == NULL);
+
+ before = ibuf_index_page_calc_free_bits(
+ srv_page_size, max_ins_size);
+
+ if (max_ins_size >= increase) {
+ compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX);
+ after = ibuf_index_page_calc_free_bits(
+ srv_page_size, max_ins_size - increase);
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(after <= ibuf_index_page_calc_free(block));
+#endif
+ } else {
+ after = ibuf_index_page_calc_free(block);
+ }
+
+ if (after == 0) {
+ /* We move the page to the front of the buffer pool LRU list:
+ the purpose of this is to prevent those pages to which we
+ cannot make inserts using the insert buffer from slipping
+ out of the buffer pool */
+
+ buf_page_make_young(&block->page);
+ }
+
+ if (before > after) {
+ ibuf_set_free_bits(block, after, before);
+ }
+}
diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h
new file mode 100644
index 00000000..a7e61395
--- /dev/null
+++ b/storage/innobase/include/lock0iter.h
@@ -0,0 +1,66 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0iter.h
+Lock queue iterator type and function prototypes.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0iter_h
+#define lock0iter_h
+
+#include "lock0types.h"
+
+struct lock_queue_iterator_t {
+ const lock_t* current_lock;
+ /* In case this is a record lock queue (not table lock queue)
+ then bit_no is the record number within the heap in which the
+ record is stored. */
+ ulint bit_no;
+};
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no);/*!< in: record number in the
+ heap */
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+receded (if not-NULL is returned).
+@return previous lock or NULL */
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter); /*!< in/out: iterator */
+
+#endif /* lock0iter_h */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
new file mode 100644
index 00000000..59ee7f55
--- /dev/null
+++ b/storage/innobase/include/lock0lock.h
@@ -0,0 +1,1271 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2022, Oracle and/or its affiliates.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.h
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "buf0types.h"
+#include "trx0trx.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "hash0hash.h"
+#include "srv0srv.h"
+#include "ut0vec.h"
+#include "gis0rtree.h"
+#include "lock0prdt.h"
+#include "transactional_lock_guard.h"
+
+// Forward declaration
+class ReadView;
+
+/** The value of innodb_deadlock_detect */
+extern my_bool innodb_deadlock_detect;
+/** The value of innodb_deadlock_report */
+extern ulong innodb_deadlock_report;
+
+namespace Deadlock
+{
+ /** The allowed values of innodb_deadlock_report */
+ enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL };
+}
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block); /*!< in: buffer block */
+
+/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
+after an aborted CREATE INDEX operation.
+@param index a stale index on which ADD INDEX operation was aborted */
+ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index);
+
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock);/*!< in: copy of the old, not
+ reorganized page */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec); /*!< in: record on page: this
+ is the first record moved */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end); /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block); /*!< in: merged index
+ page which will be
+ discarded */
+/** Update locks when the root page is copied to another in
+btr_root_raise_and_insert(). Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+void lock_update_root_raise(const buf_block_t &block, const page_id_t root);
+/** Update the lock table when a page is copied to another.
+@param new_block the target page
+@param old old page (not index root page) */
+void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old);
+
+/** Update gap locks between the last record of the left_block and the
+first record of the right_block when a record is about to be inserted
+at the start of the right_block, even though it should "naturally" be
+inserted as the last record of the left_block according to the
+current node pointer in the parent page.
+
+That is, we assume that the lowest common ancestor of the left_block
+and right_block routes the key of the new record to the left_block,
+but a heuristic which tries to avoid overflowing left_block has chosen
+to insert the record into right_block instead. Said ancestor performs
+this routing by comparing the key of the record to a "split point" -
+all records greater or equal to than the split point (node pointer)
+are in right_block, and smaller ones in left_block.
+The split point may be smaller than the smallest key in right_block.
+
+The gap between the last record on the left_block and the first record
+on the right_block is represented as a gap lock attached to the supremum
+pseudo-record of left_block, and a gap lock attached to the new first
+record of right_block.
+
+Thus, inserting the new record, and subsequently adjusting the node
+pointers in parent pages to values smaller or equal to the new
+records' key, will mean that gap will be sliced at a different place
+("moved to the left"): fragment of the 1st gap will now become treated
+as 2nd. Therefore, we must copy any GRANTED locks from 1st gap to the
+2nd gap. Any WAITING locks must be of INSERT_INTENTION type (as no
+other GAP locks ever wait for anything) and can stay at 1st gap, as
+their only purpose is to notify the requester they can retry
+insertion, and there's no correctness requirement to avoid waking them
+up too soon.
+@param left_block left page
+@param right_block right page */
+void lock_update_node_pointer(const buf_block_t *left_block,
+ const buf_block_t *right_block);
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/** Update the lock table when a page is merged to the left.
+@param left left page
+@param orig_pred original predecessor of supremum on the left page before merge
+@param right merged, to-be-discarded right page */
+void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
+ const page_id_t right);
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation. */
+void lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t& heir_block, /*!< in: block containing the
+ record which inherits */
+ const page_id_t donor, /*!< in: page containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no); /*!< in: heap_no of the
+ donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block); /*!< in: index page
+ which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the inserted record */
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the record to be removed */
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is in such an update moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+/** Restore the explicit lock requests on a single record, where the
+state was stored on the infimum of a page.
+@param block buffer block containing rec
+@param rec record whose lock state is restored
+@param donator page (rec is not necessarily on this page)
+whose infimum stored the lock state; lock bits are reset on the infimum */
+void lock_rec_restore_from_page_infimum(const buf_block_t &block,
+ const rec_t *rec, page_id_t donator);
+
+/**
+Create a table lock, without checking for deadlocks or lock compatibility.
+@param table table on which the lock is created
+@param type_mode lock type and mode
+@param trx transaction
+@param c_lock conflicting lock
+@return the created lock object */
+lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
+ lock_t *c_lock= nullptr);
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ bool* inherit)/*!< out: set to true if the new
+ inserted record maybe should inherit
+ LOCK_GAP type locks from the successor
+ record */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Acquire a table lock.
+@param table table to be locked
+@param fktable pointer to table, in case of a FOREIGN key check
+@param mode lock mode
+@param thr SQL execution thread
+@retval DB_SUCCESS if the lock was acquired
+@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table
+@retval DB_LOCK_WAIT if lock_wait() must be invoked */
+dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
+ lock_mode mode, que_thr_t *thr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create a table lock object for a resurrected transaction.
+@param table table to be X-locked
+@param trx transaction
+@param mode LOCK_X or LOCK_IX */
+void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode);
+
+/** Sets a lock on a table based on the given mode.
+@param table table to lock
+@param trx transaction
+@param mode LOCK_X or LOCK_S
+@param no_wait whether to skip handling DB_LOCK_WAIT
+@return error code */
+dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
+ bool no_wait= false)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Exclusively lock the data dictionary tables.
+@param trx dictionary transaction
+@return error code
+@retval DB_SUCCESS on success */
+dberr_t lock_sys_tables(trx_t *trx);
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in/out: transaction that has
+ set a record lock */
+ const page_id_t id, /*!< in: page containing rec */
+ const rec_t* rec, /*!< in: record */
+ lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
+
+/** Release the explicit locks of a committing transaction,
+and release possible other transactions waiting because of these locks. */
+void lock_release(trx_t* trx);
+
+/** Release the explicit locks of a committing transaction while
+dict_sys.latch is exclusively locked,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_drop(trx_t *trx);
+
+/** Release non-exclusive locks on XA PREPARE,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_prepare(trx_t *trx);
+
+/** Release locks on a table whose creation is being rolled back */
+ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table);
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
+if none found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock); /*!< in: record lock with at least one
+ bit set */
+
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return whether lock1 has to wait for lock2 to be removed */
+bool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2); /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+/*********************************************************************//**
+Reports that a transaction id is insensible, i.e., in the future. */
+ATTRIBUTE_COLD
+void
+lock_report_trx_id_insanity(
+/*========================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if not able to acquire lock_sys.latch (and display info) */
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+ ibool nowait) /*!< in: whether to wait for lock_sys.latch */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Prints transaction lock wait and MVCC state.
+@param[in,out] file file where to print
+@param[in] trx transaction
+@param[in] now current my_hrtime_coarse() */
+void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx,
+ my_hrtime_t now);
+
+/*********************************************************************//**
+Prints info of locks for each transaction. This function will release
+lock_sys.latch, which the caller must be holding in exclusive mode. */
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file); /*!< in: file where to print */
+
+/*********************************************************************//**
+Return the number of table locks for a transaction.
+The caller must be holding lock_sys.latch. */
+ulint
+lock_number_of_tables_locked(
+/*=========================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+bool lock_table_has_locks(dict_table_t *table);
+
+/** Wait for a lock to be released.
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
+@retval DB_INTERRUPTED if the execution was interrupted by the user
+@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
+@retval DB_SUCCESS if the lock was granted */
+dberr_t lock_wait(que_thr_t *thr);
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx); /*!< in/out: transaction */
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+@param trx transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+ lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx);
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets); /*!< in: rec_get_offsets(rec, index) */
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction to check */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if the transaction holds an explicit exclusive lock on a record.
+@param[in] trx transaction
+@param[in] table table
+@param[in] id leaf page identifier
+@param[in] heap_no heap number identifying the record
+@return whether an explicit X-lock is held */
+bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
+ page_id_t id, ulint heap_no);
+#endif /* UNIV_DEBUG */
+
+/** Lock operation struct */
+struct lock_op_t{
+ dict_table_t* table; /*!< table to be locked */
+ lock_mode mode; /*!< lock mode */
+};
+
+/** The lock system struct */
+class lock_sys_t
+{
+ friend struct LockGuard;
+ friend struct LockMultiGuard;
+ friend struct TMLockGuard;
+ friend struct TMLockMutexGuard;
+ friend struct TMLockTrxGuard;
+
+ /** Hash table latch */
+ struct hash_latch
+#ifdef SUX_LOCK_GENERIC
+ : private rw_lock
+ {
+ /** Wait for an exclusive lock */
+ void wait();
+ /** Try to acquire a lock */
+ bool try_acquire() { return write_trylock(); }
+ /** Acquire a lock */
+ void acquire() { if (!try_acquire()) wait(); }
+ /** Release a lock */
+ void release();
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const
+ { return rw_lock::is_locked_or_waiting(); }
+ /** @return whether this latch is possibly held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
+#else
+ {
+ private:
+ srw_spin_lock_low lock;
+ public:
+ /** Try to acquire a lock */
+ bool try_acquire() { return lock.wr_lock_try(); }
+ /** Acquire a lock */
+ void acquire() { lock.wr_lock(); }
+ /** Release a lock */
+ void release() { lock.wr_unlock(); }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
+ /** @return whether this latch is possibly held by any thread */
+ bool is_locked() const noexcept { return lock.is_locked(); }
+#endif
+ };
+
+public:
+ struct hash_table
+ {
+ /** Number of consecutive array[] elements occupied by a hash_latch */
+ static constexpr size_t LATCH= sizeof(void*) >= sizeof(hash_latch) ? 1 : 2;
+ static_assert(sizeof(hash_latch) <= LATCH * sizeof(void*), "allocation");
+
+ /** Number of array[] elements per hash_latch.
+ Must be LATCH less than a power of 2. */
+ static constexpr size_t ELEMENTS_PER_LATCH= (64 / sizeof(void*)) - LATCH;
+ static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+ ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+ /** number of payload elements in array[]. Protected by lock_sys.latch. */
+ ulint n_cells;
+ /** the hash table, with pad(n_cells) elements, aligned to L1 cache size;
+ in any hash chain, lock_t::is_waiting() entries must not precede
+ granted locks */
+ hash_cell_t *array;
+
+ /** Create the hash table.
+ @param n the lower bound of n_cells */
+ void create(ulint n);
+
+ /** Resize the hash table.
+ @param n the lower bound of n_cells */
+ void resize(ulint n);
+
+ /** Free the hash table. */
+ void free() { aligned_free(array); array= nullptr; }
+
+ /** @return the index of an array element */
+ inline ulint calc_hash(ulint fold) const;
+
+ /** @return raw array index converted to padded index */
+ static ulint pad(ulint h)
+ {
+ ulint latches= LATCH * (h / ELEMENTS_PER_LATCH);
+ ulint empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
+ return LATCH + latches + empty_slots + h;
+ }
+
+ /** Get a latch. */
+ static hash_latch *latch(hash_cell_t *cell)
+ {
+ void *l= ut_align_down(cell, sizeof *cell *
+ (ELEMENTS_PER_LATCH + LATCH));
+ return static_cast<hash_latch*>(l);
+ }
+ /** Get a hash table cell. */
+ inline hash_cell_t *cell_get(ulint fold) const;
+
+#ifdef UNIV_DEBUG
+ void assert_locked(const page_id_t id) const;
+#else
+ void assert_locked(const page_id_t) const {}
+#endif
+
+ private:
+ /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+ static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+ /** @return the index of an array element */
+ static ulint calc_hash(ulint fold, ulint n_cells)
+ {
+ return pad(hash(fold, n_cells));
+ }
+ };
+
+private:
+ bool m_initialised;
+
+ /** mutex proteting the locks */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
+#ifdef UNIV_DEBUG
+ /** The owner of exclusive latch (0 if none); protected by latch */
+ std::atomic<pthread_t> writer{0};
+ /** Number of shared latches */
+ std::atomic<ulint> readers{0};
+#endif
+#ifdef SUX_LOCK_GENERIC
+protected:
+ /** mutex for hash_latch::wait() */
+ pthread_mutex_t hash_mutex;
+ /** condition variable for hash_latch::wait() */
+ pthread_cond_t hash_cond;
+#endif
+public:
+ /** record locks */
+ hash_table rec_hash;
+ /** predicate locks for SPATIAL INDEX */
+ hash_table prdt_hash;
+ /** page locks for SPATIAL INDEX */
+ hash_table prdt_page_hash;
+
+ /** mutex covering lock waits; @see trx_lock_t::wait_lock */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
+private:
+ /** The increment of wait_count for a wait. Anything smaller is a
+ pending wait count. */
+ static constexpr uint64_t WAIT_COUNT_STEP= 1U << 19;
+ /** waits and total number of lock waits; protected by wait_mutex */
+ uint64_t wait_count;
+ /** Cumulative wait time; protected by wait_mutex */
+ uint64_t wait_time;
+ /** Longest wait time; protected by wait_mutex */
+ uint64_t wait_time_max;
+public:
+ /** number of deadlocks detected; protected by wait_mutex */
+ ulint deadlocks;
+ /** number of lock wait timeouts; protected by wait_mutex */
+ ulint timeouts;
+ /**
+ Constructor.
+
+ Some members may require late initialisation, thus we just mark object as
+ uninitialised. Real initialisation happens in create().
+ */
+ lock_sys_t(): m_initialised(false) {}
+
+
+ bool is_initialised() const { return m_initialised; }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Acquire exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE
+ void wr_lock(const char *file, unsigned line);
+ /** Release exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE void wr_unlock();
+ /** Acquire shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_lock(const char *file, unsigned line);
+ /** Release shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_unlock();
+#else
+ /** Acquire exclusive lock_sys.latch */
+ void wr_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.wr_lock();
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release exclusive lock_sys.latch */
+ void wr_unlock()
+ {
+ ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
+ pthread_self());
+ latch.wr_unlock();
+ }
+ /** Acquire shared lock_sys.latch */
+ void rd_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.rd_lock();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ }
+ /** Release shared lock_sys.latch */
+ void rd_unlock()
+ {
+ ut_ad(!is_writer());
+ ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
+ latch.rd_unlock();
+ }
+#endif
+ /** Try to acquire exclusive lock_sys.latch
+ @return whether the latch was acquired */
+ bool wr_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.wr_lock_try()) return false;
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ return true;
+ }
+ /** Try to acquire shared lock_sys.latch
+ @return whether the latch was acquired */
+ bool rd_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.rd_lock_try()) return false;
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ return true;
+ }
+
+ /** Assert that wr_lock() has been invoked by this thread */
+ void assert_locked() const { ut_ad(is_writer()); }
+ /** Assert that wr_lock() has not been invoked by this thread */
+ void assert_unlocked() const { ut_ad(!is_writer()); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is the lock_sys.latch writer */
+ bool is_writer() const
+ {
+# ifdef SUX_LOCK_GENERIC
+ return writer.load(std::memory_order_relaxed) == pthread_self();
+# else
+ return writer.load(std::memory_order_relaxed) == pthread_self() ||
+ (xtest() && !latch.is_locked_or_waiting());
+# endif
+ }
+ /** Assert that a lock shard is exclusively latched (by some thread) */
+ void assert_locked(const lock_t &lock) const;
+ /** Assert that a table lock shard is exclusively latched by this thread */
+ void assert_locked(const dict_table_t &table) const;
+ /** Assert that a hash table cell is exclusively latched (by some thread) */
+ void assert_locked(const hash_cell_t &cell) const;
+#else
+ void assert_locked(const lock_t &) const {}
+ void assert_locked(const dict_table_t &) const {}
+ void assert_locked(const hash_cell_t &) const {}
+#endif
+
+ /**
+ Creates the lock system at database start.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void create(ulint n_cells);
+
+
+ /**
+ Resize the lock hash table.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void resize(ulint n_cells);
+
+
+ /** Closes the lock system at database shutdown. */
+ void close();
+
+
+ /** Check for deadlocks while holding only lock_sys.wait_mutex. */
+ void deadlock_check();
+
+ /** Cancel a waiting lock request.
+ @tparam check_victim whether to check for DB_DEADLOCK
+ @param trx active transaction
+ @param lock waiting lock request
+ @retval DB_SUCCESS if no lock existed
+ @retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+ @retval DB_LOCK_WAIT if the lock was canceled */
+ template<bool check_victim>
+ static dberr_t cancel(trx_t *trx, lock_t *lock);
+
+ /** Note that a record lock wait started */
+ inline void wait_start();
+
+ /** Note that a record lock wait resumed */
+ inline void wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now);
+
+ /** @return pending number of lock waits */
+ ulint get_wait_pending() const
+ {
+ return static_cast<ulint>(wait_count & (WAIT_COUNT_STEP - 1));
+ }
+ /** @return cumulative number of lock waits */
+ ulint get_wait_cumulative() const
+ { return static_cast<ulint>(wait_count / WAIT_COUNT_STEP); }
+ /** Cumulative wait time; protected by wait_mutex */
+ uint64_t get_wait_time_cumulative() const { return wait_time; }
+ /** Longest wait time; protected by wait_mutex */
+ uint64_t get_wait_time_max() const { return wait_time_max; }
+
+ /** Get the lock hash table for a mode */
+ hash_table &hash_get(ulint mode)
+ {
+ if (UNIV_LIKELY(!(mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))))
+ return rec_hash;
+ return (mode & LOCK_PREDICATE) ? prdt_hash : prdt_page_hash;
+ }
+
+ /** Get the lock hash table for predicate a mode */
+ hash_table &prdt_hash_get(bool page)
+ { return page ? prdt_page_hash : prdt_hash; }
+
+ /** Get the first lock on a page.
+ @param cell hash table cell
+ @param id page number
+ @return first lock
+ @retval nullptr if none exists */
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id);
+
+ /** Get the first explicit lock request on a record.
+ @param cell first lock hash table cell
+ @param id page identifier
+ @param heap_no record identifier in page
+ @return first lock
+ @retval nullptr if none exists */
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no);
+
+ /** Remove locks on a discarded SPATIAL INDEX page.
+ @param id page to be discarded
+ @param page whether to discard also from lock_sys.prdt_hash */
+ void prdt_page_free_from_discard(const page_id_t id, bool all= false);
+
+ /** Cancel possible lock waiting for a transaction */
+ static void cancel_lock_wait_for_trx(trx_t *trx);
+#ifdef WITH_WSREP
+ /** Cancel lock waiting for a wsrep BF abort. */
+ static void cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx);
+#endif /* WITH_WSREP */
+};
+
+/** The lock system */
+extern lock_sys_t lock_sys;
+
+/** @return the index of an array element */
+inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return calc_hash(fold, n_cells);
+}
+
+/** Get a hash table cell. */
+inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return &array[calc_hash(fold)];
+}
+
+/** Get the first lock on a page.
+@param cell hash table cell
+@param id page number
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id)
+{
+ lock_sys.assert_locked(cell);
+ for (auto lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id)
+ return lock;
+ }
+ return nullptr;
+}
+
+/** lock_sys.latch exclusive guard */
+struct LockMutexGuard
+{
+ LockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ { lock_sys.wr_lock(SRW_LOCK_ARGS(file, line)); }
+ ~LockMutexGuard() { lock_sys.wr_unlock(); }
+};
+
+/** lock_sys latch guard for 1 page_id_t */
+struct LockGuard
+{
+ LockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ ~LockGuard()
+ {
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+};
+
+/** lock_sys latch guard for 2 page_id_t */
+struct LockMultiGuard
+{
+ LockMultiGuard(lock_sys_t::hash_table &hash,
+ const page_id_t id1, const page_id_t id2);
+ ~LockMultiGuard();
+
+ /** @return the first hash array cell */
+ hash_cell_t &cell1() const { return *cell1_; }
+ /** @return the second hash array cell */
+ hash_cell_t &cell2() const { return *cell2_; }
+private:
+ /** The first hash array cell */
+ hash_cell_t *cell1_;
+ /** The second hash array cell */
+ hash_cell_t *cell2_;
+};
+
+/** lock_sys.latch exclusive guard using transactional memory */
+struct TMLockMutexGuard
+{
+ TRANSACTIONAL_INLINE
+ TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockMutexGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided()) xend(); else
+#endif
+ lock_sys.wr_unlock();
+ }
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept
+ { return !lock_sys.latch.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** lock_sys latch guard for 1 page_id_t, using transactional memory */
+struct TMLockGuard
+{
+ TRANSACTIONAL_TARGET
+ TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ TRANSACTIONAL_INLINE ~TMLockGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ /** whether the latches were elided */
+ bool elided;
+#endif
+};
+
+/** guard for shared lock_sys.latch and trx_t::mutex using
+transactional memory */
+struct TMLockTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE
+#ifndef UNIV_PFS_RWLOCK
+ TMLockTrxGuard(trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) trx
+#else
+ TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
+#endif
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (!lock_sys.latch.is_write_locked() && was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys.rd_unlock();
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** guard for trx_t::mutex using transactional memory */
+struct TMTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE ~TMTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/*********************************************************************//**
+Creates a new record lock and inserts it to the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+ lock_t* c_lock, /*!< conflicting lock */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex);
+ /*!< in: true if caller owns
+ trx mutex */
+
+/** Remove a record lock request, waiting or granted, on a discarded page
+@param hash hash table
+@param in_lock lock object */
+void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
+
+/** Create a new record lock and inserts it to the lock queue,
+without checking for deadlocks or conflicts.
+@param[in] c_lock conflicting lock, or NULL
+@param[in] type_mode lock mode and wait flag
+@param[in] page_id index page number
+@param[in] page R-tree index page, or NULL
+@param[in] heap_no record heap number in the index page
+@param[in] index the index tree
+@param[in,out] trx transaction
+@param[in] holds_trx_mutex whether the caller holds trx->mutex
+@return created lock */
+lock_t*
+lock_rec_create_low(
+ lock_t* c_lock,
+ unsigned type_mode,
+ const page_id_t page_id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ trx_t* trx,
+ bool holds_trx_mutex);
+
+/** Enqueue a waiting request for a lock which cannot be granted immediately.
+Check for deadlocks.
+@param[in] c_lock conflicting lock
+@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
+ possibly ORed with LOCK_GAP or
+ LOCK_REC_NOT_GAP, ORed with
+ LOCK_INSERT_INTENTION if this
+ waiting lock request is set
+ when performing an insert of
+ an index record
+@param[in] id page identifier
+@param[in] page leaf page in the index
+@param[in] heap_no record heap number in the block
+@param[in] index index tree
+@param[in,out] thr query thread
+@param[in] prdt minimum bounding box (spatial index)
+@retval DB_LOCK_WAIT if the waiting lock was enqueued
+@retval DB_DEADLOCK if this transaction was chosen as the victim */
+dberr_t
+lock_rec_enqueue_waiting(
+ lock_t* c_lock,
+ unsigned type_mode,
+ const page_id_t id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ que_thr_t* thr,
+ lock_prdt_t* prdt);
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_rtr_move_rec_list(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ move to */
+ const buf_block_t* block, /*!< in: index page */
+ rtr_rec_move_t* rec_move, /*!< in: recording records
+ moved */
+ ulint num_move); /*!< in: num of rec to move */
+
+#include "lock0lock.inl"
+
+#endif
diff --git a/storage/innobase/include/lock0lock.inl b/storage/innobase/include/lock0lock.inl
new file mode 100644
index 00000000..1b9255ff
--- /dev/null
+++ b/storage/innobase/include/lock0lock.inl
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "page0page.h"
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const page_t* page = block->page.frame;
+
+ if (page_is_comp(page)) {
+ return(rec_get_heap_no_new(
+ page
+ + rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE)));
+ } else {
+ return(rec_get_heap_no_old(
+ page
+ + rec_get_next_offs(page + PAGE_OLD_INFIMUM,
+ FALSE)));
+ }
+}
+
+/*********************************************************************//**
+Creates a new record lock and inserts it to the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+ lock_t* c_lock, /*!< conflicting lock */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex)
+ /*!< in: TRUE if caller owns
+ trx mutex */
+{
+ return lock_rec_create_low(
+ c_lock,
+ type_mode, block->page.id(), block->page.frame, heap_no,
+ index, trx, caller_owns_trx_mutex);
+}
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
new file mode 100644
index 00000000..db8e3392
--- /dev/null
+++ b/storage/innobase/include/lock0prdt.h
@@ -0,0 +1,192 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0prdt.h
+The predicate lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+#ifndef lock0prdt_h
+#define lock0prdt_h
+
+#include "lock0lock.h"
+
+/* Predicate lock data */
+typedef struct lock_prdt {
+ void* data; /* Predicate data */
+ uint16 op; /* Predicate operator */
+} lock_prdt_t;
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ lock_prdt_t* prdt, /*!< in: Predicate for the lock */
+ dict_index_t* index, /*!< in: secondary index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned type_mode,
+ /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+ que_thr_t* thr); /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+ const page_id_t page_id, /*!< in: page identifier */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr); /*!< in: query thread */
+
+/*********************************************************************//**
+Initiate a Predicate lock from a MBR */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+ lock_prdt_t* prdt, /*!< in/out: predicate to initialized */
+ rtr_mbr_t* mbr, /*!< in: Minimum Bounding Rectangle */
+ ulint mode, /*!< in: Search mode */
+ mem_heap_t* heap); /*!< in: heap for allocating memory */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box*/
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+ const lock_t* lock); /*!< in: the lock */
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/**************************************************************//**
+Update predicate lock when page splits */
+void
+lock_prdt_update_split(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ lock_prdt_t* prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: page number */
+
+/**************************************************************//**
+Ajust locks from an ancester page of Rtree on the appropriate level . */
+void
+lock_prdt_update_parent(
+/*====================*/
+ buf_block_t* left_block, /*!< in/out: page to be split */
+ buf_block_t* right_block, /*!< in/out: the new half page */
+ lock_prdt_t* left_prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* right_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: parent page */
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a predicate record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_insert_check_and_lock(
+/*============================*/
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ lock_prdt_t* prdt); /*!< in: Minimum Bound Rectangle */
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+ lock_t* lock, /*!< in: lock */
+ const lock_prdt_t* prdt); /*!< in: Predicate */
+
+#if 0
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box*/
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+ const lock_prdt_t* prdt); /*!< in: the lock predicate */
+
+
+#endif
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+void
+lock_prdt_rec_move(
+/*===============*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const page_id_t donator); /*!< in: target page */
+
+/** Check whether there are R-tree Page lock on a page
+@param[in] trx trx to test the lock
+@param[in] page_id page identifier
+@return true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
+
+#endif
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
new file mode 100644
index 00000000..b0a5f7aa
--- /dev/null
+++ b/storage/innobase/include/lock0priv.h
@@ -0,0 +1,582 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "hash0hash.h"
+#include "rem0types.h"
+#include "trx0trx.h"
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+/** Print the table lock into the given output stream
+@param[in,out] out the output stream
+@return the given output stream. */
+inline
+std::ostream& lock_table_t::print(std::ostream& out) const
+{
+ out << "[lock_table_t: name=" << table->name << "]";
+ return(out);
+}
+
+/** The global output operator is overloaded to conveniently
+print the lock_table_t object into the given output stream.
+@param[in,out] out the output stream
+@param[in] lock the table lock
+@return the given output stream */
+inline
+std::ostream&
+operator<<(std::ostream& out, const lock_table_t& lock)
+{
+ return(lock.print(out));
+}
+
+inline
+std::ostream&
+ib_lock_t::print(std::ostream& out) const
+{
+ static_assert(LOCK_MODE_MASK == 7, "compatibility");
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
+ static_assert(LOCK_S == 2, "compatibility");
+ static_assert(LOCK_X == 3, "compatibility");
+ static_assert(LOCK_AUTO_INC == 4, "compatibility");
+ static_assert(LOCK_NONE == 5, "compatibility");
+ static_assert(LOCK_NONE_UNSET == 7, "compatibility");
+ const char *const modes[8]=
+ { "IS", "IX", "S", "X", "AUTO_INC", "NONE", "?", "NONE_UNSET" };
+
+ out << "[lock_t: type_mode=" << type_mode << "(" << type_string()
+ << " | LOCK_" << modes[mode()];
+
+ if (is_record_not_gap())
+ out << " | LOCK_REC_NOT_GAP";
+ if (is_waiting())
+ out << " | LOCK_WAIT";
+
+ if (is_gap())
+ out << " | LOCK_GAP";
+
+ if (is_insert_intention())
+ out << " | LOCK_INSERT_INTENTION";
+
+ out << ")";
+
+ if (is_table())
+ out << un_member.tab_lock;
+ else
+ out << un_member.rec_lock;
+
+ out << "]";
+ return out;
+}
+
+inline
+std::ostream&
+operator<<(std::ostream& out, const ib_lock_t& lock)
+{
+ return(lock.print(out));
+}
+
+#ifdef UNIV_DEBUG
+extern ibool lock_print_waits;
+#endif /* UNIV_DEBUG */
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user so that there is
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record and thus may, or may not,
+bump into an x-lock set there.
+
+Different transaction can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+RULE 4: If a there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transaction cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieves the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or new inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+ There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This records cannot be
+updated but the lock prevents insert of a user record to the end of
+the page.
+ Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serilizability of
+transactions.
+ What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock? If it was set by a searched update, then there
+is implicitly an s-lock, too, and the insert should be prevented.
+What if our transaction owns an x-lock to the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should see also the new inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the new inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+ We could have the convention that granted explicit record locks,
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+ How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+ A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+/* LOCK COMPATIBILITY MATRIX
+ * IS IX S X AI
+ * IS + + + - +
+ * IX + + - - +
+ * S + - + - -
+ * X - - - - -
+ * AI + + - - -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+static const byte lock_compatibility_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, TRUE, TRUE, FALSE, TRUE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, TRUE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { FALSE, FALSE, FALSE, FALSE, FALSE},
+ /* AI */ { TRUE, TRUE, FALSE, FALSE, FALSE}
+};
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ * IS IX S X AI
+ * IS + - - - -
+ * IX + + - - -
+ * S + - + - -
+ * X + + + + +
+ * AI - - - - +
+ * See lock_mode_stronger_or_eq().
+ */
+static const byte lock_strength_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, FALSE, FALSE, FALSE, FALSE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, FALSE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { TRUE, TRUE, TRUE, TRUE, TRUE},
+ /* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE}
+};
+
+#define PRDT_HEAPNO PAGE_HEAP_NO_INFIMUM
+/** Record locking request status */
+enum lock_rec_req_status {
+ /** Failed to acquire a lock */
+ LOCK_REC_FAIL,
+ /** Succeeded in acquiring a lock (implicit or already acquired) */
+ LOCK_REC_SUCCESS,
+ /** Explicitly created a new lock */
+ LOCK_REC_SUCCESS_CREATED
+};
+
+#ifdef UNIV_DEBUG
+/** The count of the types of locks. */
+static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no);/*!< in: heap number of the record */
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if bit set also if i == ULINT_UNDEFINED return FALSE*/
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock); /*!< in: record lock */
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/** Reset the nth bit of a record lock.
+@param[in,out] lock record lock
+@param[in] i index of the bit that will be reset
+@return previous value of the bit */
+inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
+{
+ ut_ad(!lock->is_table());
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
+ byte mask = byte(1U << (i & 7));
+ byte bit = *b & mask;
+ *b &= byte(~mask);
+
+ if (bit != 0) {
+ ut_d(auto n=)
+ lock->trx->lock.n_rec_locks--;
+ ut_ad(n);
+ }
+
+ return(bit);
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock); /*!< in: lock */
+
+/** Get the first explicit lock request on a record.
+@param cell first lock hash table cell
+@param id page identifier
+@param heap_no record identifier in page
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no)
+{
+ lock_sys.assert_locked(cell);
+
+ for (lock_t *lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id &&
+ lock_rec_get_nth_bit(lock, heap_no))
+ return lock;
+ }
+ return nullptr;
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table */
+ enum lock_mode mode); /*!< in: lock mode */
+
+#include "lock0priv.inl"
+
+#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.inl b/storage/innobase/include/lock0priv.inl
new file mode 100644
index 00000000..3b4ebcc8
--- /dev/null
+++ b/storage/innobase/include/lock0priv.inl
@@ -0,0 +1,255 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.cc.
+I.e. lock/lock0lock.cc contains more internal inline
+methods but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+#include "row0row.h"
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+
+ return(row_get_rec_trx_id(rec, index, offsets));
+}
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ return(lock->un_member.rec_lock.n_bits);
+}
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(!lock->is_table());
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+ ((byte*) &lock[1])[byte_index] |= static_cast<byte>(1 << bit_index);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ lock->trx->lock.n_rec_locks++;
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock) /*!< in: a record lock */
+{
+ return const_cast<lock_t*>(lock_rec_get_next_on_page_const(lock));
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock) /*!< in: lock */
+{
+ do {
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock && !lock_rec_get_nth_bit(lock, heap_no));
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock) /*!< in: lock */
+{
+ return lock_rec_get_next(heap_no, const_cast<lock_t*>(lock));
+}
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if bit set also if i == ULINT_UNDEFINED return FALSE*/
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ const byte* b;
+
+ ut_ad(!lock->is_table());
+
+ if (i >= lock->un_member.rec_lock.n_bits) {
+
+ return(FALSE);
+ }
+
+ b = ((const byte*) &lock[1]) + (i / 8);
+
+ return(1 & *b >> (i % 8));
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock) /*!< in: a record lock */
+{
+ ut_ad(!lock->is_table());
+
+ const page_id_t page_id{lock->un_member.rec_lock.page_id};
+
+ while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))))
+ if (lock->un_member.rec_lock.page_id == page_id)
+ break;
+ return lock;
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_compatibility_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_strength_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table */
+ lock_mode in_mode)/*!< in: lock mode */
+{
+ /* Look for stronger locks the same trx already has on the table */
+
+ for (lock_list::const_iterator it = trx->lock.table_locks.begin(),
+ end = trx->lock.table_locks.end(); it != end; ++it) {
+
+ const lock_t* lock = *it;
+
+ if (lock == NULL) {
+ continue;
+ }
+
+ ut_ad(trx == lock->trx);
+ ut_ad(lock->is_table());
+ ut_ad(lock->un_member.tab_lock.table);
+
+ if (table == lock->un_member.tab_lock.table
+ && lock_mode_stronger_or_eq(lock->mode(), in_mode)) {
+ ut_ad(!lock->is_waiting());
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
new file mode 100644
index 00000000..0d00b4b3
--- /dev/null
+++ b/storage/innobase/include/lock0types.h
@@ -0,0 +1,251 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0types.h
+The transaction lock system global types
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0types.h"
+#include "buf0types.h"
+#include "ut0lst.h"
+
+#ifndef lock0types_h
+#define lock0types_h
+
+#define lock_t ib_lock_t
+
+struct lock_t;
+struct lock_table_t;
+
+/* Basic lock modes */
+enum lock_mode {
+ LOCK_IS = 0, /* intention shared */
+ LOCK_IX, /* intention exclusive */
+ LOCK_S, /* shared */
+ LOCK_X, /* exclusive */
+ LOCK_AUTO_INC, /* locks the auto-inc counter of a table
+ in an exclusive mode */
+ LOCK_NONE, /* this is used elsewhere to note consistent read */
+ LOCK_NUM = LOCK_NONE, /* number of lock modes */
+ LOCK_NONE_UNSET = 7
+};
+
+/** A table lock */
+struct lock_table_t {
+ dict_table_t* table; /*!< database table in dictionary
+ cache */
+ UT_LIST_NODE_T(ib_lock_t)
+ locks; /*!< list of locks on the same
+ table */
+ /** Print the table lock into the given output stream
+ @param[in,out] out the output stream
+ @return the given output stream. */
+ std::ostream& print(std::ostream& out) const;
+};
+
+/** Record lock for a page */
+struct lock_rec_t {
+ /** page identifier */
+ page_id_t page_id;
+ ib_uint32_t n_bits; /*!< number of bits in the lock
+ bitmap; NOTE: the lock bitmap is
+ placed immediately after the
+ lock struct */
+
+ /** Print the record lock into the given output stream
+ @param[in,out] out the output stream
+ @return the given output stream. */
+ std::ostream& print(std::ostream& out) const;
+};
+
+/** Print the record lock into the given output stream
+@param[in,out] out the output stream
+@return the given output stream. */
+inline std::ostream &lock_rec_t::print(std::ostream &out) const
+{
+ out << "[lock_rec_t: space=" << page_id.space()
+ << ", page_no=" << page_id.page_no()
+ << ", n_bits=" << n_bits << "]";
+ return out;
+}
+
+inline
+std::ostream&
+operator<<(std::ostream& out, const lock_rec_t& lock)
+{
+ return(lock.print(out));
+}
+
+#define LOCK_MODE_MASK 0x7 /*!< mask used to extract mode from the
+ type_mode field in a lock */
+/** Lock types */
+/* @{ */
+/** table lock (record lock if the flag is not set) */
+#define LOCK_TABLE 8U
+
+#define LOCK_WAIT 256U /*!< Waiting lock flag; when set, it
+ means that the lock has not yet been
+ granted, it is just waiting for its
+ turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary
+ next-key lock in contrast to LOCK_GAP
+ or LOCK_REC_NOT_GAP */
+#define LOCK_GAP 512U /*!< when this bit is set, it means that the
+ lock holds only on the gap before the record;
+ for instance, an x-lock on the gap does not
+ give permission to modify the record on which
+ the bit is set; locks of this type are created
+ when records are removed from the index chain
+ of records */
+#define LOCK_REC_NOT_GAP 1024U /*!< this bit means that the lock is only on
+ the index record and does NOT block inserts
+ to the gap before the index record; this is
+ used in the case when we retrieve a record
+ with a unique key, and is also used in
+ locking plain SELECTs (not part of UPDATE
+ or DELETE) when the user has set the READ
+ COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048U/*!< this bit is set when we place a waiting
+ gap type record lock request in order to let
+ an insert of an index record to wait until
+ there are no conflicting locks by other
+ transactions on the gap; note that this flag
+ remains set when the waiting lock is granted,
+ or if the lock is inherited to a neighboring
+ record */
+#define LOCK_PREDICATE 8192U /*!< Predicate lock */
+#define LOCK_PRDT_PAGE 16384U /*!< Page lock */
+
+
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_MODE_MASK
+# error
+#endif
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_TYPE_MASK
+# error
+#endif
+/* @} */
+
+/**
+Checks if the `mode` is LOCK_S or LOCK_X (possibly ORed with LOCK_WAIT or
+LOCK_REC) which means the lock is a
+Next Key Lock, a.k.a. LOCK_ORDINARY, as opposed to Predicate Lock,
+GAP lock, Insert Intention or Record Lock.
+@param mode A mode and flags, of a lock.
+@return true if the only bits set in `mode` are LOCK_S or LOCK_X and optionally
+LOCK_WAIT or LOCK_REC */
+static inline bool lock_mode_is_next_key_lock(ulint mode)
+{
+ static_assert(LOCK_ORDINARY == 0, "LOCK_ORDINARY must be 0 (no flags)");
+ ut_ad((mode & LOCK_TABLE) == 0);
+ mode&= ~LOCK_WAIT;
+ ut_ad((mode & LOCK_WAIT) == 0);
+ ut_ad(((mode & ~(LOCK_MODE_MASK)) == LOCK_ORDINARY) ==
+ (mode == LOCK_S || mode == LOCK_X));
+ return (mode & ~(LOCK_MODE_MASK)) == LOCK_ORDINARY;
+}
+
+/** Lock struct; protected by lock_sys.latch */
+struct ib_lock_t
+{
+ /** the owner of the lock */
+ trx_t *trx;
+ /** other locks of the transaction; protected by
+ lock_sys.is_writer() and trx->mutex_is_owner(); @see trx_lock_t::trx_locks */
+ UT_LIST_NODE_T(ib_lock_t) trx_locks;
+
+ dict_index_t* index; /*!< index for a record lock */
+
+ ib_lock_t* hash; /*!< hash chain node for a record
+ lock. The link node in a singly linked
+ list, used during hashing. */
+
+ /** time(NULL) of the lock request creation.
+ Used for computing wait_time and diagnostics only.
+ Note: bogus durations may be reported
+ when the system time is adjusted! */
+ time_t requested_time;
+ /** Cumulated wait time in seconds.
+ Note: may be bogus when the system time is adjusted! */
+ ulint wait_time;
+
+ union {
+ lock_table_t tab_lock;/*!< table lock */
+ lock_rec_t rec_lock;/*!< record lock */
+ } un_member; /*!< lock details */
+
+ ib_uint32_t type_mode; /*!< lock type, mode, LOCK_GAP or
+ LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION,
+ wait flag, ORed */
+
+ bool is_waiting() const
+ {
+ return(type_mode & LOCK_WAIT);
+ }
+
+ bool is_gap() const
+ {
+ return(type_mode & LOCK_GAP);
+ }
+
+ bool is_record_not_gap() const
+ {
+ return(type_mode & LOCK_REC_NOT_GAP);
+ }
+
+ /** @return true if the lock is a Next Key Lock */
+ bool is_next_key_lock() const
+ {
+ return !(type_mode & LOCK_TABLE) &&
+ lock_mode_is_next_key_lock(type_mode);
+ }
+
+ bool is_insert_intention() const
+ {
+ return(type_mode & LOCK_INSERT_INTENTION);
+ }
+
+ bool is_table() const { return type_mode & LOCK_TABLE; }
+
+ enum lock_mode mode() const
+ {
+ return(static_cast<enum lock_mode>(type_mode & LOCK_MODE_MASK));
+ }
+
+ bool is_rec_granted_exclusive_not_gap() const
+ {
+ return (type_mode & (LOCK_MODE_MASK | LOCK_GAP)) == LOCK_X;
+ }
+
+ /** Print the lock object into the given output stream.
+ @param[in,out] out the output stream
+ @return the given output stream. */
+ std::ostream& print(std::ostream& out) const;
+
+ const char* type_string() const
+ { return is_table() ? "LOCK_TABLE" : "LOCK_REC"; }
+};
+
+typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t;
+
+#endif /* lock0types_h */
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
new file mode 100644
index 00000000..22c0c963
--- /dev/null
+++ b/storage/innobase/include/log0crypt.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file include/log0crypt.h
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu
+Modified Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#pragma once
+
+#include "log0log.h"
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log header.
+@see log_crypt_write_header()
+@see log_crypt_read_header()
+@return whether the operation succeeded */
+bool log_crypt_init();
+
+/** Add the encryption information to the log header buffer.
+@param buf part of log header buffer */
+void log_crypt_write_header(byte *buf);
+
+/** Read the encryption information from a redo log checkpoint buffer.
+@param buf part of checkpoint buffer
+@return whether the operation was successful */
+bool log_crypt_read_header(const byte *buf);
+
+/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
+@param[in] buf checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf);
+
+/** Decrypt a MariaDB 10.1 redo log block.
+@param[in,out] buf log block
+@param[in] start_lsn server start LSN
+@return whether the decryption was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn);
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in] buf checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf);
+
+/** Decrypt log blocks.
+@param[in,out] buf log blocks to decrypt
+@param[in] lsn log sequence number of the start of the buffer
+@param[in] size size of the buffer, in bytes
+@return whether the operation succeeded */
+ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size);
+
+/** Decrypt part of a log record.
+@param iv initialization vector
+@param buf buffer for the decrypted data
+@param data the encrypted data
+@param len length of the data, in bytes
+@return buf */
+byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len);
+
+/** Decrypt a log snippet.
+@param iv initialization vector
+@param buf buffer to be replaced with encrypted contents
+@param end pointer past the end of buf */
+void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end);
+
+/** Encrypt or decrypt a temporary file block.
+@param[in] src block to encrypt or decrypt
+@param[in] size size of the block
+@param[out] dst destination block
+@param[in] offs offset to block
+@param[in] encrypt true=encrypt; false=decrypt
+@return whether the operation succeeded */
+bool log_tmp_block_encrypt(
+ const byte* src,
+ ulint size,
+ byte* dst,
+ uint64_t offs,
+ bool encrypt = true)
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Decrypt a temporary file block.
+@param[in] src block to decrypt
+@param[in] size size of the block
+@param[out] dst destination block
+@param[in] offs offset to block
+@return whether the operation succeeded */
+inline
+bool
+log_tmp_block_decrypt(
+ const byte* src,
+ ulint size,
+ byte* dst,
+ uint64_t offs)
+{
+ return(log_tmp_block_encrypt(src, size, dst, offs, false));
+}
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
new file mode 100644
index 00000000..f873eabf
--- /dev/null
+++ b/storage/innobase/include/log0log.h
@@ -0,0 +1,529 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0types.h"
+#include "os0file.h"
+#include "span.h"
+#include "my_atomic_wrapper.h"
+#include "srw_lock.h"
+#include <string>
+
+using st_::span;
+
+static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile";
+static const char LOG_FILE_NAME[] = "ib_logfile0";
+
+/** Composes full path for a redo log file
+@param[in] filename name of the redo log file
+@return path with log file name*/
+std::string get_log_file_path(const char *filename= LOG_FILE_NAME);
+
+/** Delete log file.
+@param[in] suffix suffix of the file name */
+static inline void delete_log_file(const char* suffix)
+{
+ auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix);
+ os_file_delete_if_exists_func(path.c_str(), nullptr);
+}
+
+struct completion_callback;
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param lsn log sequence number that should be included in the file write
+@param durable whether the write needs to be durable
+@param callback log write completion callback */
+void log_write_up_to(lsn_t lsn, bool durable,
+ const completion_callback *callback= nullptr);
+
+/** Write to the log file up to the last log entry.
+@param durable whether to wait for a durable write to complete */
+void log_buffer_flush_to_disk(bool durable= true);
+
+
+/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
+ATTRIBUTE_COLD void log_write_and_flush_prepare();
+
+/** Durably write the log up to log_sys.get_lsn(). */
+ATTRIBUTE_COLD void log_write_and_flush();
+
+/** Make a checkpoint */
+ATTRIBUTE_COLD void log_make_checkpoint();
+
+/** Make a checkpoint at the latest lsn on shutdown. */
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown();
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins();
+
+/******************************************************//**
+Prints info of the log. */
+void
+log_print(
+/*======*/
+ FILE* file); /*!< in: file where to print */
+
+/** Offsets of a log file header */
+/* @{ */
+/** Log file header format identifier (32-bit unsigned big-endian integer).
+This used to be called LOG_GROUP_ID and always written as 0,
+because InnoDB never supported more than one copy of the redo log. */
+#define LOG_HEADER_FORMAT 0
+/** LSN of the start of data in this log file (with format version 1;
+in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */
+#define LOG_HEADER_START_LSN 8
+/** A null-terminated string which will contain either the string 'ibbackup'
+and the creation time if the log file was created by mysqlbackup --restore,
+or the MySQL version that created the redo log file. */
+#define LOG_HEADER_CREATOR 16
+/** End of the log file creator field. */
+#define LOG_HEADER_CREATOR_END 48
+/* @} */
+
+struct log_t;
+
+/** File abstraction */
+class log_file_t
+{
+ friend log_t;
+ os_file_t m_file{OS_FILE_CLOSED};
+public:
+ log_file_t()= default;
+ log_file_t(os_file_t file) noexcept : m_file(file) {}
+
+ /** Open a file
+ @return file size in bytes
+ @retval 0 if not readable */
+ os_offset_t open(bool read_only) noexcept;
+ bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; }
+
+ dberr_t close() noexcept;
+ dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
+ void write(os_offset_t offset, span<const byte> buf) noexcept;
+ bool flush() const noexcept { return os_file_flush(m_file); }
+#ifdef HAVE_PMEM
+ byte *mmap(bool read_only, const struct stat &st) noexcept;
+#endif
+};
+
+/** Redo log buffer */
+struct log_t
+{
+ /** The original (not version-tagged) InnoDB redo log format */
+ static constexpr uint32_t FORMAT_3_23= 0;
+ /** The MySQL 5.7.9/MariaDB 10.2.2 log format */
+ static constexpr uint32_t FORMAT_10_2= 1;
+ /** The MariaDB 10.3.2 log format. */
+ static constexpr uint32_t FORMAT_10_3= 103;
+ /** The MariaDB 10.4.0 log format. */
+ static constexpr uint32_t FORMAT_10_4= 104;
+ /** Encrypted MariaDB redo log */
+ static constexpr uint32_t FORMAT_ENCRYPTED= 1U << 31;
+ /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */
+ static constexpr uint32_t FORMAT_ENC_10_4= FORMAT_10_4 | FORMAT_ENCRYPTED;
+ /** The MariaDB 10.5.1 physical redo log format */
+ static constexpr uint32_t FORMAT_10_5= 0x50485953;
+ /** The MariaDB 10.5.1 physical format (only with innodb_encrypt_log=ON) */
+ static constexpr uint32_t FORMAT_ENC_10_5= FORMAT_10_5 | FORMAT_ENCRYPTED;
+ /** The MariaDB 10.8.0 variable-block-size redo log format */
+ static constexpr uint32_t FORMAT_10_8= 0x50687973;
+ /** The MariaDB 10.8.0 format with innodb_encrypt_log=ON */
+ static constexpr uint32_t FORMAT_ENC_10_8= FORMAT_10_8 | FORMAT_ENCRYPTED;
+
+ /** Location of the first checkpoint block */
+ static constexpr size_t CHECKPOINT_1= 4096;
+ /** Location of the second checkpoint block */
+ static constexpr size_t CHECKPOINT_2= 8192;
+ /** Start of record payload */
+ static constexpr lsn_t START_OFFSET= 12288;
+
+ /** smallest possible log sequence number in the current format
+ (used to be 2048 before FORMAT_10_8). */
+ static constexpr lsn_t FIRST_LSN= START_OFFSET;
+
+private:
+ /** The log sequence number of the last change of durable InnoDB files */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ std::atomic<lsn_t> lsn;
+ /** the first guaranteed-durable log sequence number */
+ std::atomic<lsn_t> flushed_to_disk_lsn;
+ /** log sequence number when log resizing was initiated, or 0 */
+ std::atomic<lsn_t> resize_lsn;
+ /** set when there may be need to flush the log buffer, or
+ preflush buffer pool pages, or initiate a log checkpoint.
+ This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
+ std::atomic<bool> check_flush_or_checkpoint_;
+
+
+#if defined(__aarch64__)
+/* On ARM, we do more spinning */
+typedef srw_spin_lock log_rwlock_t;
+#define LSN_LOCK_ATTR MY_MUTEX_INIT_FAST
+#else
+typedef srw_lock log_rwlock_t;
+#define LSN_LOCK_ATTR nullptr
+#endif
+
+public:
+ /** rw-lock protecting buf */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock_t latch;
+private:
+ /** Last written LSN */
+ lsn_t write_lsn;
+public:
+ /** log record buffer, written to by mtr_t::commit() */
+ byte *buf;
+ /** buffer for writing data to ib_logfile0, or nullptr if is_pmem()
+ In write_buf(), buf and flush_buf are swapped */
+ byte *flush_buf;
+ /** number of std::swap(buf, flush_buf) and writes from buf to log;
+ protected by latch.wr_lock() */
+ ulint write_to_log;
+
+ /** Log sequence number when a log file overwrite (broken crash recovery)
+ was noticed. Protected by latch.wr_lock(). */
+ lsn_t overwrite_warned;
+
+ /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */
+ size_t buf_size;
+
+private:
+ /** Log file being constructed during resizing; protected by latch */
+ log_file_t resize_log;
+ /** size of resize_log; protected by latch */
+ lsn_t resize_target;
+ /** Buffer for writing to resize_log; @see buf */
+ byte *resize_buf;
+ /** Buffer for writing to resize_log; @see flush_buf */
+ byte *resize_flush_buf;
+
+ /** spin lock protecting lsn, buf_free in append_prepare() */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) pthread_mutex_t lsn_lock;
+ void init_lsn_lock() { pthread_mutex_init(&lsn_lock, LSN_LOCK_ATTR); }
+ void lock_lsn() { pthread_mutex_lock(&lsn_lock); }
+ void unlock_lsn() { pthread_mutex_unlock(&lsn_lock); }
+ void destroy_lsn_lock() { pthread_mutex_destroy(&lsn_lock); }
+
+public:
+ /** first free offset within buf use; protected by lsn_lock */
+ Atomic_relaxed<size_t> buf_free;
+ /** number of write requests (to buf); protected by exclusive lsn_lock */
+ ulint write_to_buf;
+ /** number of waits in append_prepare(); protected by lsn_lock */
+ ulint waits;
+ /** recommended maximum size of buf, after which the buffer is flushed */
+ size_t max_buf_free;
+
+ /** log file size in bytes, including the header */
+ lsn_t file_size;
+private:
+ /** the log sequence number at the start of the log file */
+ lsn_t first_lsn;
+#if defined __linux__ || defined _WIN32
+ /** The physical block size of the storage */
+ uint32_t block_size;
+#endif
+public:
+ /** format of the redo log: e.g., FORMAT_10_8 */
+ uint32_t format;
+ /** Log file */
+ log_file_t log;
+#if defined __linux__ || defined _WIN32
+ /** whether file system caching is enabled for the log */
+ my_bool log_buffered;
+# ifdef _WIN32
+ static constexpr bool log_maybe_unbuffered= true;
+# else
+ /** whether file system caching may be disabled */
+ bool log_maybe_unbuffered;
+# endif
+#endif
+
+ /** Fields involved in checkpoints @{ */
+ lsn_t log_capacity; /*!< capacity of the log; if
+ the checkpoint age exceeds this, it is
+ a serious error because it is possible
+ we will then overwrite log and spoil
+ crash recovery */
+ lsn_t max_modified_age_async;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool.get_oldest_modification()
+ is exceeded, we start an
+ asynchronous preflush of pool pages */
+ lsn_t max_checkpoint_age;
+ /*!< this is the maximum allowed value
+ for lsn - last_checkpoint_lsn when a
+ new query step is started */
+ /** latest completed checkpoint (protected by latch.wr_lock()) */
+ Atomic_relaxed<lsn_t> last_checkpoint_lsn;
+ /** next checkpoint LSN (protected by log_sys.latch) */
+ lsn_t next_checkpoint_lsn;
+ /** next checkpoint number (protected by latch.wr_lock()) */
+ ulint next_checkpoint_no;
+ /** whether a checkpoint is pending */
+ Atomic_relaxed<bool> checkpoint_pending;
+
+ /** buffer for checkpoint header */
+ byte *checkpoint_buf;
+ /* @} */
+
+ bool is_initialised() const noexcept { return max_buf_free != 0; }
+
+#ifdef HAVE_PMEM
+ bool is_pmem() const noexcept { return !flush_buf; }
+#else
+ static constexpr bool is_pmem() { return false; }
+#endif
+
+ bool is_opened() const noexcept { return log.is_opened(); }
+
+ /** @return LSN at which log resizing was started and is still in progress
+ @retval 0 if no log resizing is in progress */
+ lsn_t resize_in_progress() const noexcept
+ { return resize_lsn.load(std::memory_order_relaxed); }
+
+ /** Status of resize_start() */
+ enum resize_start_status {
+ RESIZE_NO_CHANGE, RESIZE_IN_PROGRESS, RESIZE_STARTED, RESIZE_FAILED
+ };
+
+ /** Start resizing the log and release the exclusive latch.
+ @param size requested new file_size
+ @return whether the resizing was started successfully */
+ resize_start_status resize_start(os_offset_t size) noexcept;
+
+ /** Abort any resize_start(). */
+ void resize_abort() noexcept;
+
+ /** Replicate a write to the log.
+ @param lsn start LSN
+ @param end end of the mini-transaction
+ @param len length of the mini-transaction
+ @param seq offset of the sequence bit from the end */
+ inline void resize_write(lsn_t lsn, const byte *end,
+ size_t len, size_t seq) noexcept;
+
+ /** Write resize_buf to resize_log.
+ @param length the used length of resize_buf */
+ ATTRIBUTE_COLD void resize_write_buf(size_t length) noexcept;
+
+ /** Rename a log file after resizing.
+ @return whether an error occurred */
+ static bool resize_rename() noexcept;
+
+#ifdef HAVE_PMEM
+ /** @return pointer for writing to resize_buf
+ @retval nullptr if no PMEM based resizing is active */
+ inline byte *resize_buf_begin(lsn_t lsn) const noexcept;
+ /** @return end of resize_buf */
+ inline const byte *resize_buf_end() const noexcept
+ { return resize_buf + resize_target; }
+
+ /** Initialise the redo log subsystem. */
+ void create_low();
+ /** Initialise the redo log subsystem.
+ @return whether the initialisation succeeded */
+ bool create() { create_low(); return true; }
+
+ /** Attach a log file.
+ @return whether the memory allocation succeeded */
+ bool attach(log_file_t file, os_offset_t size);
+#else
+ /** Initialise the redo log subsystem.
+ @return whether the initialisation succeeded */
+ bool create();
+ /** Attach a log file. */
+ void attach_low(log_file_t file, os_offset_t size);
+ bool attach(log_file_t file, os_offset_t size)
+ { attach_low(file, size); return true; }
+#endif
+
+#if defined __linux__ || defined _WIN32
+ /** Try to enable or disable file system caching (update log_buffered) */
+ void set_buffered(bool buffered);
+#endif
+
+ void close_file();
+
+ /** Calculate the checkpoint safety margins. */
+ static void set_capacity();
+
+ /** Write a log file header.
+ @param buf log header buffer
+ @param lsn log sequence number corresponding to log_sys.START_OFFSET
+ @param encrypted whether the log is encrypted */
+ static void header_write(byte *buf, lsn_t lsn, bool encrypted);
+
+ lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const
+ { return lsn.load(order); }
+
+ lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire)
+ const noexcept
+ { return flushed_to_disk_lsn.load(order); }
+
+ /** Initialize the LSN on initial log file creation. */
+ lsn_t init_lsn() noexcept
+ {
+ latch.wr_lock(SRW_LOCK_CALL);
+ const lsn_t lsn{get_lsn()};
+ flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
+ write_lsn= lsn;
+ latch.wr_unlock();
+ return lsn;
+ }
+
+ void set_recovered_lsn(lsn_t lsn) noexcept
+ {
+#ifndef SUX_LOCK_GENERIC
+ ut_ad(latch.is_write_locked());
+#endif /* SUX_LOCK_GENERIC */
+ write_lsn= lsn;
+ this->lsn.store(lsn, std::memory_order_relaxed);
+ flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
+ }
+
+#ifdef HAVE_PMEM
+ /** Persist the log.
+ @param lsn desired new value of flushed_to_disk_lsn */
+ inline void persist(lsn_t lsn) noexcept;
+#endif
+
+ bool check_flush_or_checkpoint() const
+ {
+ return UNIV_UNLIKELY
+ (check_flush_or_checkpoint_.load(std::memory_order_relaxed));
+ }
+ void set_check_flush_or_checkpoint(bool flag= true)
+ { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); }
+
+ /** Make previous write_buf() durable and update flushed_to_disk_lsn. */
+ bool flush(lsn_t lsn) noexcept;
+
+ /** Shut down the redo log subsystem. */
+ void close();
+
+#if defined __linux__ || defined _WIN32
+ /** @return the physical block size of the storage */
+ size_t get_block_size() const noexcept
+ { ut_ad(block_size); return block_size; }
+ /** Set the log block size for file I/O. */
+ void set_block_size(uint32_t size) noexcept { block_size= size; }
+#else
+ /** @return the physical block size of the storage */
+ static size_t get_block_size() { return 512; }
+#endif
+
+private:
+ /** Wait in append_prepare() for buffer to become available
+ @param ex whether log_sys.latch is exclusively locked */
+ ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept;
+public:
+ /** Reserve space in the log buffer for appending data.
+ @tparam pmem log_sys.is_pmem()
+ @param size total length of the data to append(), in bytes
+ @param ex whether log_sys.latch is exclusively locked
+ @return the start LSN and the buffer position for append() */
+ template<bool pmem>
+ inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
+
+ /** Append a string of bytes to the redo log.
+ @param d destination
+ @param s string of bytes
+ @param size length of str, in bytes */
+ void append(byte *&d, const void *s, size_t size) noexcept
+ {
+#ifndef SUX_LOCK_GENERIC
+ ut_ad(latch.is_locked());
+#endif
+ ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size));
+ memcpy(d, s, size);
+ d+= size;
+ }
+
+ /** Set the log file format. */
+ void set_latest_format(bool encrypted) noexcept
+ { format= encrypted ? FORMAT_ENC_10_8 : FORMAT_10_8; }
+ /** @return whether the redo log is encrypted */
+ bool is_encrypted() const noexcept { return format & FORMAT_ENCRYPTED; }
+ /** @return whether the redo log is in the latest format */
+ bool is_latest() const noexcept
+ { return (~FORMAT_ENCRYPTED & format) == FORMAT_10_8; }
+
+ /** @return capacity in bytes */
+ lsn_t capacity() const noexcept { return file_size - START_OFFSET; }
+
+ /** Set the LSN of the log file at file creation. */
+ void set_first_lsn(lsn_t lsn) noexcept { write_lsn= first_lsn= lsn; }
+ /** @return the first LSN of the log file */
+ lsn_t get_first_lsn() const noexcept { return first_lsn; }
+
+ /** Determine the sequence bit at a log sequence number */
+ byte get_sequence_bit(lsn_t lsn) const noexcept
+ {
+ ut_ad(lsn >= first_lsn);
+ return !(((lsn - first_lsn) / capacity()) & 1);
+ }
+
+ /** Calculate the offset of a log sequence number.
+ @param lsn log sequence number
+ @return byte offset within ib_logfile0 */
+ lsn_t calc_lsn_offset(lsn_t lsn) const noexcept
+ {
+ ut_ad(lsn >= first_lsn);
+ return START_OFFSET + (lsn - first_lsn) % capacity();
+ }
+
+ /** Write checkpoint information and invoke latch.wr_unlock().
+ @param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
+ inline void write_checkpoint(lsn_t end_lsn) noexcept;
+
+ /** Write buf to ib_logfile0.
+ @tparam release_latch whether to invoke latch.wr_unlock()
+ @return the current log sequence number */
+ template<bool release_latch> inline lsn_t write_buf() noexcept;
+
+ /** Create the log. */
+ void create(lsn_t lsn) noexcept;
+};
+
+/** Redo log system */
+extern log_t log_sys;
+
+/** Wait for a log checkpoint if needed.
+NOTE that this function may only be called while not holding
+any synchronization objects except dict_sys.latch. */
+void log_free_check();
+
+/** Release the latches that protect log resizing. */
+void log_resize_release();
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
new file mode 100644
index 00000000..6d75e15a
--- /dev/null
+++ b/storage/innobase/include/log0recv.h
@@ -0,0 +1,491 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0new.h"
+#include "buf0types.h"
+#include "log0log.h"
+#include "mtr0types.h"
+
+#include <deque>
+#include <map>
+
+/** @return whether recovery is currently running. */
+#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
+
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Apply any buffered redo log to a page.
+@param space tablespace
+@param bpage buffer pool page
+@return whether the page was recovered correctly */
+bool recv_recover_page(fil_space_t* space, buf_page_t* bpage);
+
+/** Start recovering from a redo log checkpoint.
+of first system tablespace page
+@return error code or DB_SUCCESS */
+dberr_t recv_recovery_from_checkpoint_start();
+
+/** Report an operation to create, delete, or rename a file during backup.
+@param[in] space_id tablespace identifier
+@param[in] type file operation redo log type
+@param[in] name file name (not NUL-terminated)
+@param[in] len length of name, in bytes
+@param[in] new_name new file name (NULL if not rename)
+@param[in] new_len length of new_name, in bytes (0 if NULL) */
+extern void (*log_file_op)(uint32_t space_id, int type,
+ const byte* name, ulint len,
+ const byte* new_name, ulint new_len);
+
+/** Report an operation which does undo log tablespace truncation
+during backup
+@param space_id undo tablespace identifier */
+extern void (*undo_space_trunc)(uint32_t space_id);
+
+/** Report an operation which does INIT_PAGE for page0 during backup.
+@param space_id tablespace identifier */
+extern void (*first_page_init)(uint32_t space_id);
+
+/** Stored redo log record */
+struct log_rec_t
+{
+ log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); }
+ log_rec_t()= delete;
+ log_rec_t(const log_rec_t&)= delete;
+ log_rec_t &operator=(const log_rec_t&)= delete;
+
+ /** next record */
+ log_rec_t *next;
+ /** mtr_t::commit_lsn() of the mini-transaction */
+ const lsn_t lsn;
+};
+
+struct recv_dblwr_t
+{
+ /** Add a page frame to the doublewrite recovery buffer. */
+ void add(byte *page) { pages.push_front(page); }
+
+ /** Validate the page.
+ @param page_id page identifier
+ @param page page contents
+ @param space the tablespace of the page (not available for page 0)
+ @param tmp_buf 2*srv_page_size for decrypting and decompressing any
+ page_compressed or encrypted pages
+ @return whether the page is valid */
+ bool validate_page(const page_id_t page_id, const byte *page,
+ const fil_space_t *space, byte *tmp_buf);
+
+ /** Find a doublewrite copy of a page.
+ @param page_id page identifier
+ @param space tablespace (not available for page_id.page_no()==0)
+ @param tmp_buf 2*srv_page_size for decrypting and decompressing any
+ page_compressed or encrypted pages
+ @return page frame
+ @retval NULL if no valid page for page_id was found */
+ byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL,
+ byte *tmp_buf= NULL);
+
+ /** Restore the first page of the given tablespace from
+ doublewrite buffer.
+ @param space_id tablespace identifier
+ @param name tablespace filepath
+ @param file tablespace file handle
+ @return whether the operation failed */
+ bool restore_first_page(uint32_t space_id, const char *name, os_file_t file);
+
+ typedef std::deque<byte*, ut_allocator<byte*> > list;
+
+ /** Recovered doublewrite buffer page frames */
+ list pages;
+};
+
+/** recv_sys.pages entry; protected by recv_sys.mutex */
+struct page_recv_t
+{
+ /** Recovery status: 0=not in progress, 1=log is being applied,
+ -1=log has been applied and the entry may be erased.
+ Transitions from 1 to -1 are NOT protected by recv_sys.mutex. */
+ Atomic_relaxed<int8_t> being_processed{0};
+ /** Whether reading the page will be skipped */
+ bool skip_read= false;
+ /** Latest written byte offset when applying the log records.
+ @see mtr_t::m_last_offset */
+ uint16_t last_offset= 1;
+ /** log records for a page */
+ class recs_t
+ {
+ /** The first log record */
+ log_rec_t *head= nullptr;
+ /** The last log record */
+ log_rec_t *tail= nullptr;
+ friend struct page_recv_t;
+ public:
+ /** Append a redo log snippet for the page
+ @param recs log snippet */
+ void append(log_rec_t* recs)
+ {
+ if (tail)
+ tail->next= recs;
+ else
+ head= recs;
+ tail= recs;
+ }
+ /** Remove the last records for the page
+ @param start_lsn start of the removed log */
+ ATTRIBUTE_COLD void rewind(lsn_t start_lsn);
+
+ /** @return the last log snippet */
+ const log_rec_t* last() const { return tail; }
+ /** @return the last log snippet */
+ log_rec_t* last() { return tail; }
+
+ class iterator
+ {
+ log_rec_t *cur;
+ public:
+ iterator(log_rec_t* rec) : cur(rec) {}
+ log_rec_t* operator*() const { return cur; }
+ iterator &operator++() { cur= cur->next; return *this; }
+ bool operator!=(const iterator& i) const { return cur != i.cur; }
+ };
+ iterator begin() { return head; }
+ iterator end() { return NULL; }
+ bool empty() const { ut_ad(!head == !tail); return !head; }
+ /** Clear and free the records; @see recv_sys_t::add() */
+ void clear();
+ } log;
+
+ /** Trim old log records for a page.
+ @param start_lsn oldest log sequence number to preserve
+ @return whether all the log for the page was trimmed */
+ inline bool trim(lsn_t start_lsn);
+ /** Ignore any earlier redo log records for this page. */
+ inline void will_not_read();
+};
+
+/** A page initialization operation that was parsed from the redo log */
+struct recv_init
+{
+ /** log sequence number of the page initialization */
+ lsn_t lsn;
+ /** Whether btr_page_create() avoided a read of the page.
+ At the end of the last recovery batch, mark_ibuf_exist()
+ will mark pages for which this flag is set. */
+ bool created;
+};
+
+/** Recovery system data structure */
+struct recv_sys_t
+{
+ using init= recv_init;
+
+ /** mutex protecting this as well as some of page_recv_t */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+private:
+ /** set when finding a corrupt log block or record, or there is a
+ log parsing buffer overflow */
+ bool found_corrupt_log;
+ /** set when an inconsistency with the file system contents is detected
+ during log scan or apply */
+ bool found_corrupt_fs;
+public:
+ /** @return maximum guaranteed size of a mini-transaction on recovery */
+ static constexpr size_t MTR_SIZE_MAX{1U << 20};
+
+ /** whether we are applying redo log records during crash recovery */
+ bool recovery_on;
+ /** whether recv_recover_page(), invoked from buf_page_t::read_complete(),
+ should apply log records*/
+ bool apply_log_recs;
+ /** number of bytes in log_sys.buf */
+ size_t len;
+ /** start offset of non-parsed log records in log_sys.buf */
+ size_t offset;
+ /** log sequence number of the first non-parsed record */
+ lsn_t lsn;
+ /** log sequence number of the last parsed mini-transaction */
+ lsn_t scanned_lsn;
+ /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */
+ lsn_t file_checkpoint;
+ /** the time when progress was last reported */
+ time_t progress_time;
+
+ using map = std::map<const page_id_t, page_recv_t,
+ std::less<const page_id_t>,
+ ut_allocator<std::pair<const page_id_t, page_recv_t>>>;
+ /** buffered records waiting to be applied to pages */
+ map pages;
+
+private:
+ /** iterator to pages, used by parse() */
+ map::iterator pages_it;
+
+ /** Process a record that indicates that a tablespace size is being shrunk.
+ @param page_id first page that is not in the file
+ @param lsn log sequence number of the shrink operation */
+ inline void trim(const page_id_t page_id, lsn_t lsn);
+
+ /** Undo tablespaces for which truncate has been logged
+ (indexed by page_id_t::space() - srv_undo_space_id_start) */
+ struct trunc
+ {
+ /** log sequence number of FILE_CREATE, or 0 if none */
+ lsn_t lsn;
+ /** truncated size of the tablespace, or 0 if not truncated */
+ unsigned pages;
+ } truncated_undo_spaces[127];
+
+public:
+ /** The contents of the doublewrite buffer */
+ recv_dblwr_t dblwr;
+
+ __attribute__((warn_unused_result))
+ inline dberr_t read(os_offset_t offset, span<byte> buf);
+ inline size_t files_size();
+ void close_files();
+
+ /** Advance pages_it if it matches the iterator */
+ void pages_it_invalidate(const map::iterator &p)
+ {
+ mysql_mutex_assert_owner(&mutex);
+ if (pages_it == p)
+ pages_it++;
+ }
+ /** Invalidate pages_it if it points to the given tablespace */
+ void pages_it_invalidate(uint32_t space_id)
+ {
+ mysql_mutex_assert_owner(&mutex);
+ if (pages_it != pages.end() && pages_it->first.space() == space_id)
+ pages_it= pages.end();
+ }
+
+private:
+ /** Attempt to initialize a page based on redo log records.
+ @param p iterator
+ @param mtr mini-transaction
+ @param b pre-allocated buffer pool block
+ @param init page initialization
+ @return the recovered block
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
+ inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr,
+ buf_block_t *b, init &init);
+ /** Attempt to initialize a page based on redo log records.
+ @param page_id page identifier
+ @return the recovered block
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
+ ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id);
+
+ /** All found log files (multiple ones are possible if we are upgrading
+ from before MariaDB Server 10.5.1) */
+ std::vector<log_file_t> files;
+
+ /** Base node of the redo block list.
+ List elements are linked via buf_block_t::unzip_LRU. */
+ UT_LIST_BASE_NODE_T(buf_block_t) blocks;
+
+ /** Allocate a block from the buffer pool for recv_sys.pages */
+ ATTRIBUTE_COLD buf_block_t *add_block();
+
+ /** Wait for buffer pool to become available.
+ @param pages number of buffer pool pages needed */
+ ATTRIBUTE_COLD void wait_for_pool(size_t pages);
+
+ /** Free log for processed pages. */
+ void garbage_collect();
+
+ /** Apply a recovery batch.
+ @param space_id current tablespace identifier
+ @param space current tablespace
+ @param free_block spare buffer block
+ @param last_batch whether it is possible to write more redo log
+ @return whether the caller must provide a new free_block */
+ bool apply_batch(uint32_t space_id, fil_space_t *&space,
+ buf_block_t *&free_block, bool last_batch);
+
+public:
+ /** Apply buffered log to persistent data pages.
+ @param last_batch whether it is possible to write more redo log */
+ void apply(bool last_batch);
+
+#ifdef UNIV_DEBUG
+ /** whether all redo log in the current batch has been applied */
+ bool after_apply= false;
+#endif
+ /** Initialize the redo log recovery subsystem. */
+ void create();
+
+ /** Free most recovery data structures. */
+ void debug_free();
+
+ /** Clean up after create() */
+ void close();
+
+ bool is_initialised() const { return scanned_lsn != 0; }
+
+ /** Find the latest checkpoint.
+ @return error code or DB_SUCCESS */
+ dberr_t find_checkpoint();
+
+ /** Register a redo log snippet for a page.
+ @param it page iterator
+ @param start_lsn start LSN of the mini-transaction
+ @param lsn @see mtr_t::commit_lsn()
+ @param l redo log snippet
+ @param len length of l, in bytes
+ @return whether we ran out of memory */
+ bool add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+ const byte *l, size_t len);
+
+ /** Parsing result */
+ enum parse_mtr_result {
+ /** a record was successfully parsed */
+ OK,
+ /** the log ended prematurely (need to read more) */
+ PREMATURE_EOF,
+ /** the end of the log was reached */
+ GOT_EOF,
+ /** parse<true>(l, false) ran out of memory */
+ GOT_OOM
+ };
+
+private:
+ /** Parse and register one log_t::FORMAT_10_8 mini-transaction.
+ @tparam store whether to store the records
+ @param l log data source
+ @param if_exists if store: whether to check if the tablespace exists */
+ template<typename source,bool store>
+ inline parse_mtr_result parse(source &l, bool if_exists) noexcept;
+
+ /** Rewind a mini-transaction when parse() runs out of memory.
+ @param l log data source
+ @param begin start of the mini-transaction */
+ template<typename source>
+ ATTRIBUTE_COLD void rewind(source &l, source &begin) noexcept;
+
+ /** Report progress in terms of LSN or pages remaining */
+ ATTRIBUTE_COLD void report_progress() const;
+public:
+ /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+ handling log_sys.is_pmem() buffer wrap-around.
+ @tparam store whether to store the records
+ @param if_exists if store: whether to check if the tablespace exists */
+ template<bool store>
+ static parse_mtr_result parse_mtr(bool if_exists) noexcept;
+
+ /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+ handling log_sys.is_pmem() buffer wrap-around.
+ @tparam store whether to store the records
+ @param if_exists if store: whether to check if the tablespace exists */
+ template<bool store>
+ static parse_mtr_result parse_pmem(bool if_exists) noexcept
+#ifdef HAVE_PMEM
+ ;
+#else
+ { return parse_mtr<store>(if_exists); }
+#endif
+
+ /** Erase log records for a page. */
+ void erase(map::iterator p);
+
+ /** Clear a fully processed set of stored redo log records. */
+ void clear();
+
+ /** Determine whether redo log recovery progress should be reported.
+ @param time the current time
+ @return whether progress should be reported
+ (the last report was at least 15 seconds ago) */
+ bool report(time_t time);
+
+ /** The alloc() memory alignment, in bytes */
+ static constexpr size_t ALIGNMENT= sizeof(size_t);
+
+ /** Free a redo log snippet.
+ @param data buffer allocated in add() */
+ inline void free(const void *data);
+
+ /** Remove records for a corrupted page.
+ This function should only be called when innodb_force_recovery is set.
+ @param page_id corrupted page identifier */
+ ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+
+ /** Flag data file corruption during recovery. */
+ ATTRIBUTE_COLD void set_corrupt_fs();
+ /** Flag log file corruption during recovery. */
+ ATTRIBUTE_COLD void set_corrupt_log();
+
+ /** @return whether data file corruption was found */
+ bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); }
+ /** @return whether log file corruption was found */
+ bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }
+
+ /** Attempt to initialize a page based on redo log records.
+ @param page_id page identifier
+ @return the recovered block
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
+ buf_block_t *recover(const page_id_t page_id)
+ {
+ return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
+ }
+
+ /** Try to recover a tablespace that was not readable earlier
+ @param p iterator
+ @param name tablespace file name
+ @param free_block spare buffer block
+ @return recovered tablespace
+ @retval nullptr if recovery failed */
+ fil_space_t *recover_deferred(const map::iterator &p,
+ const std::string &name,
+ buf_block_t *&free_block);
+};
+
+/** The recovery system */
+extern recv_sys_t recv_sys;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this will be set if
+recv_sys.pages becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+extern bool recv_no_ibuf_operations;
+/** TRUE when recv_init_crash_recovery() has been called. */
+extern bool recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** whether writing to the redo log is forbidden;
+protected by exclusive log_sys.latch. */
+extern bool recv_no_log_write;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start(). */
+extern bool recv_lsn_checks_on;
diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h
new file mode 100644
index 00000000..df87968d
--- /dev/null
+++ b/storage/innobase/include/log0types.h
@@ -0,0 +1,38 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0types.h
+Log types
+
+Created 2013-03-15 Sunny Bains
+*******************************************************/
+
+#ifndef log0types_h
+#define log0types_h
+
+#include "univ.i"
+
+/* Type used for all log sequence number storage and arithmetics */
+typedef ib_uint64_t lsn_t;
+
+#define LSN_MAX IB_UINT64_MAX
+
+#define LSN_PF UINT64PF
+
+#endif /* log0types_h */
diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h
new file mode 100644
index 00000000..79cbd7d1
--- /dev/null
+++ b/storage/innobase/include/mach0data.h
@@ -0,0 +1,375 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.h
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#include "univ.i"
+#include "mtr0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* The data and all fields are always stored in a database file
+in the same format: ascii, big-endian, ... .
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */
+/** The following function is used to fetch data from one byte.
+@param[in] b pointer to a byte to read
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+uint8_t
+mach_read_from_1(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lower address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */
+#endif /* !UNIV_INNOCHECKSUM */
+/** The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 2 bytes where to store
+@return 2-byte integer, >= 0, < 64k */
+UNIV_INLINE
+uint16_t
+mach_read_from_2(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifndef UNIV_INNOCHECKSUM
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+ MY_ATTRIBUTE((const));
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+ MY_ATTRIBUTE((const));
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/** The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 3 bytes to read
+@return 32 bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_3(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/** The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 4 bytes to read
+@return 32 bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_4(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a ulint in a compressed form (1..5 bytes).
+@return stored size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/*********************************************************//**
+Returns the size of an ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer to be stored */
+ MY_ATTRIBUTE((const));
+/** Read a 32-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint32_t
+mach_read_next_compressed(
+ const byte** b);
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ ib_uint64_t id); /*!< in: 48-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ ib_uint64_t n); /*!< in: 56-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ void* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_compressed(
+/*======================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/** Read a 64-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_next_compressed(
+ const byte** b);
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_much_compressed(
+/*===========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_much_compressed(
+/*==========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d); /*!< in: double */
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d); /*!< in: float */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+ MY_ATTRIBUTE((warn_unused_result));
+
+
+/** Reads a 64 bit stored in big endian format
+@param buf From where to read
+@return uint64_t */
+UNIV_INLINE
+uint64_t
+mach_read_uint64_little_endian(const byte* buf)
+{
+#ifdef WORDS_BIGENDIAN
+ return
+ uint64_t(buf[0]) | uint64_t(buf[1]) << 8 |
+ uint64_t(buf[2]) << 16 | uint64_t(buf[3]) << 24 |
+ uint64_t(buf[4]) << 32 | uint64_t(buf[5]) << 40 |
+ uint64_t(buf[6]) << 48 | uint64_t(buf[7]) << 56;
+#else
+ uint64_t n;
+ memcpy(&n, buf, sizeof(uint64_t));
+ return n;
+#endif
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type); /*!< in: signed or unsigned flag */
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+ byte* dest, /*!< in: where to write */
+ ulonglong src, /*!< in: where to read from */
+ ulint len, /*!< in: length of dest */
+ bool usign); /*!< in: signed or unsigned flag */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#include "mach0data.inl"
+
+#endif
diff --git a/storage/innobase/include/mach0data.inl b/storage/innobase/include/mach0data.inl
new file mode 100644
index 00000000..2f970fd2
--- /dev/null
+++ b/storage/innobase/include/mach0data.inl
@@ -0,0 +1,837 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.ic
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "mtr0types.h"
+#include "ut0byte.h"
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */
+{
+ ut_ad((n & ~0xFFUL) == 0);
+
+ b[0] = (byte) n;
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad((n & ~0xFFFFUL) == 0);
+
+ b[0] = (byte)(n >> 8);
+ b[1] = (byte)(n);
+}
+
+/** The following function is used to fetch data from one byte.
+@param[in] b pointer to a byte to read
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+uint8_t
+mach_read_from_1(
+ const byte* b)
+{
+ return(uint8_t(*b));
+}
+
+/** The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 2 bytes to read
+@return 2-byte integer, >= 0, < 64k */
+UNIV_INLINE
+uint16_t
+mach_read_from_2(
+ const byte* b)
+{
+ return(uint16_t(uint16_t(b[0]) << 8 | b[1]));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+{
+ uint16 ret;
+ ut_ad(2 == sizeof ret);
+ mach_write_to_2((byte*) &ret, n);
+ return(ret);
+}
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+{
+ ut_ad(2 == sizeof n);
+ return(mach_read_from_2((const byte*) &n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad((n & ~0xFFFFFFUL) == 0);
+
+ b[0] = (byte)(n >> 16);
+ b[1] = (byte)(n >> 8);
+ b[2] = (byte)(n);
+}
+
+/** The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 3 bytes to read
+@return uint32_t integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_3(
+ const byte* b)
+{
+ return( (static_cast<uint32_t>(b[0]) << 16)
+ | (static_cast<uint32_t>(b[1]) << 8)
+ | static_cast<uint32_t>(b[2])
+ );
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ b[0] = (byte)(n >> 24);
+ b[1] = (byte)(n >> 16);
+ b[2] = (byte)(n >> 8);
+ b[3] = (byte) n;
+}
+
+/** The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 4 bytes to read
+@return 32 bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_4(
+ const byte* b)
+{
+ return( (static_cast<uint32_t>(b[0]) << 24)
+ | (static_cast<uint32_t>(b[1]) << 16)
+ | (static_cast<uint32_t>(b[2]) << 8)
+ | static_cast<uint32_t>(b[3])
+ );
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*********************************************************//**
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ mach_write_to_1(b, n);
+ return(1);
+ } else if (n < 0x4000) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ mach_write_to_2(b, n | 0x8000);
+ return(2);
+ } else if (n < 0x200000) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ mach_write_to_3(b, n | 0xC00000);
+ return(3);
+ } else if (n < 0x10000000) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ mach_write_to_4(b, n | 0xE0000000);
+ return(4);
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ mach_write_to_1(b, 0xF0);
+ mach_write_to_4(b + 1, n);
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ return(1);
+ } else if (n < 0x4000) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ return(2);
+ } else if (n < 0x200000) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ return(3);
+ } else if (n < 0x10000000) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ return(4);
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer (< 2^32) */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint val;
+
+ val = mach_read_from_1(b);
+
+ if (val < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ } else if (val < 0xC0) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ val = mach_read_from_2(b) & 0x3FFF;
+ ut_ad(val > 0x7F);
+ } else if (val < 0xE0) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ val = mach_read_from_3(b) & 0x1FFFFF;
+ ut_ad(val > 0x3FFF);
+ } else if (val < 0xF0) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ val = mach_read_from_4(b) & 0xFFFFFFF;
+ ut_ad(val > 0x1FFFFF);
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ ut_ad(val == 0xF0);
+ val = mach_read_from_4(b + 1);
+ ut_ad(val > 0xFFFFFFF);
+ }
+
+ return(val);
+}
+
+/** Read a 32-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint32_t
+mach_read_next_compressed(
+ const byte** b)
+{
+ ulint val = mach_read_from_1(*b);
+
+ if (val < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ ++*b;
+ } else if (val < 0xC0) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ val = mach_read_from_2(*b) & 0x3FFF;
+ ut_ad(val > 0x7F);
+ *b += 2;
+ } else if (val < 0xE0) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ val = mach_read_from_3(*b) & 0x1FFFFF;
+ ut_ad(val > 0x3FFF);
+ *b += 3;
+ } else if (val < 0xF0) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ val = mach_read_from_4(*b) & 0xFFFFFFF;
+ ut_ad(val > 0x1FFFFF);
+ *b += 4;
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ ut_ad(val == 0xF0);
+ val = mach_read_from_4(*b + 1);
+ ut_ad(val > 0xFFFFFFF);
+ *b += 5;
+ }
+
+ return(static_cast<ib_uint32_t>(val));
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ void* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32));
+ mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+{
+ ib_uint64_t u64;
+
+ u64 = mach_read_from_4(b);
+ u64 <<= 32;
+ u64 |= mach_read_from_4(b + 4);
+
+ return(u64);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ ib_uint64_t n) /*!< in: 56-bit integer */
+{
+ mach_write_to_3(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 3, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+{
+ return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3)));
+}
+
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ ib_uint64_t n) /*!< in: 48-bit integer */
+{
+ mach_write_to_2(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 2, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+{
+ return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2)));
+}
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_compressed(
+/*======================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ulint size = mach_write_compressed(b, (ulint) (n >> 32));
+ mach_write_to_4(b + size, (ulint) n);
+
+ return(size + 4);
+}
+
+/** Read a 64-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_next_compressed(
+ const byte** b)
+{
+ ib_uint64_t val;
+
+ val = mach_read_next_compressed(b);
+ val <<= 32;
+ val |= mach_read_from_4(*b);
+ *b += 4;
+ return(val);
+}
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_much_compressed(
+/*===========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ulint size;
+
+ if (!(n >> 32)) {
+ return(mach_write_compressed(b, (ulint) n));
+ }
+
+ *b = (byte)0xFF;
+ size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32));
+
+ size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF);
+
+ return(size);
+}
+
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_much_compressed(
+/*==========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ib_uint64_t n;
+
+ if (*b != 0xFF) {
+ return(mach_read_compressed(b));
+ }
+
+ b++;
+ n = mach_read_next_compressed(&b);
+ n <<= 32;
+ n |= mach_read_compressed(b);
+
+ return(n);
+}
+
+/** Read a 64-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_next_much_compressed(
+ const byte** b)
+{
+ ib_uint64_t val = mach_read_from_1(*b);
+
+ if (val < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ ++*b;
+ } else if (val < 0xC0) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ val = mach_read_from_2(*b) & 0x3FFF;
+ ut_ad(val > 0x7F);
+ *b += 2;
+ } else if (val < 0xE0) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ val = mach_read_from_3(*b) & 0x1FFFFF;
+ ut_ad(val > 0x3FFF);
+ *b += 3;
+ } else if (val < 0xF0) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ val = mach_read_from_4(*b) & 0xFFFFFFF;
+ ut_ad(val > 0x1FFFFF);
+ *b += 4;
+ } else if (val == 0xF0) {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ val = mach_read_from_4(*b + 1);
+ ut_ad(val > 0xFFFFFFF);
+ *b += 5;
+ } else {
+ /* 11111111 followed by up to 64 bits */
+ ut_ad(val == 0xFF);
+ ++*b;
+ val = mach_read_next_compressed(b);
+ ut_ad(val > 0);
+ val <<= 32;
+ val |= mach_read_next_compressed(b);
+ }
+
+ return(val);
+}
+
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ double d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(double) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d) /*!< in: double */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(double) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ float d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(float) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d) /*!< in: float */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(float) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+{
+ ulint n = 0;
+ const byte* ptr;
+
+ ut_ad(buf_size > 0);
+
+ ptr = buf + buf_size;
+
+ for (;;) {
+ ptr--;
+
+ n = n << 8;
+
+ n += (ulint)(*ptr);
+
+ if (ptr == buf) {
+ break;
+ }
+ }
+
+ return(n);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ byte* end;
+
+ ut_ad(dest_size <= sizeof(ulint));
+ ut_ad(dest_size > 0);
+
+ end = dest + dest_size;
+
+ for (;;) {
+ *dest = (byte)(n & 0xFF);
+
+ n = n >> 8;
+
+ dest++;
+
+ if (dest == end) {
+ break;
+ }
+ }
+
+ ut_ad(n == 0);
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+{
+ return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8));
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ ut_ad(n < 256 * 256);
+
+ *dest = (byte)(n & 0xFFUL);
+
+ n = n >> 8;
+ dest++;
+
+ *dest = (byte)(n & 0xFFUL);
+}
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ /* XXX this can be optimized on big-endian machines */
+
+ uintmax_t ret;
+ uint i;
+
+ if (unsigned_type || (src[0] & 0x80)) {
+
+ ret = 0x0000000000000000ULL;
+ } else {
+
+ ret = 0xFFFFFFFFFFFFFF00ULL;
+ }
+
+ if (unsigned_type) {
+
+ ret |= src[0];
+ } else {
+
+ ret |= src[0] ^ 0x80;
+ }
+
+ for (i = 1; i < len; i++) {
+ ret <<= 8;
+ ret |= src[i];
+ }
+
+ return(ret);
+}
+/*********************************************************//**
+Swap byte ordering. */
+UNIV_INLINE
+void
+mach_swap_byte_order(
+/*=================*/
+ byte* dest, /*!< out: where to write */
+ const byte* from, /*!< in: where to read from */
+ ulint len) /*!< in: length of src */
+{
+ ut_ad(len > 0);
+ ut_ad(len <= 8);
+
+ dest += len;
+
+ switch (len & 0x7) {
+ case 0: *--dest = *from++; /* fall through */
+ case 7: *--dest = *from++; /* fall through */
+ case 6: *--dest = *from++; /* fall through */
+ case 5: *--dest = *from++; /* fall through */
+ case 4: *--dest = *from++; /* fall through */
+ case 3: *--dest = *from++; /* fall through */
+ case 2: *--dest = *from++; /* fall through */
+ case 1: *--dest = *from;
+ }
+}
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+ byte* dest, /*!< in: where to write */
+ ulonglong src, /*!< in: where to read from */
+ ulint len, /*!< in: length of dest */
+ bool usign) /*!< in: signed or unsigned flag */
+{
+ byte* ptr = reinterpret_cast<byte*>(&src);
+
+ ut_ad(len <= sizeof(ulonglong));
+
+#ifdef WORDS_BIGENDIAN
+ memcpy(dest, ptr + (sizeof(src) - len), len);
+#else
+ mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len);
+#endif /* WORDS_BIGENDIAN */
+
+ if (!usign) {
+ *dest ^= 0x80;
+ }
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/mariadb_stats.h b/storage/innobase/include/mariadb_stats.h
new file mode 100644
index 00000000..e9051c0c
--- /dev/null
+++ b/storage/innobase/include/mariadb_stats.h
@@ -0,0 +1,119 @@
+/*****************************************************************************
+
+Copyright (c) 2023, MariaDB Foundation
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef mariadb_stats_h
+#define mariadb_stats_h
+
+/* Include file to handle mariadbd handler specific stats */
+
+#include "ha_handler_stats.h"
+#include "my_rdtsc.h"
+
+/* Not active threads are ponting to this structure */
+extern thread_local ha_handler_stats mariadb_dummy_stats;
+
+/* Points to either THD->handler_stats or mariad_dummy_stats */
+extern thread_local ha_handler_stats *mariadb_stats;
+
+/*
+ Returns 1 if MariaDB wants engine status
+*/
+
+inline bool mariadb_stats_active()
+{
+ return mariadb_stats->active != 0;
+}
+
+inline bool mariadb_stats_active(ha_handler_stats *stats)
+{
+ return stats->active != 0;
+}
+
+/* The following functions increment different engine status */
+
+inline void mariadb_increment_pages_accessed()
+{
+ mariadb_stats->pages_accessed++;
+}
+
+inline void mariadb_increment_pages_updated(ulonglong count)
+{
+ mariadb_stats->pages_updated+= count;
+}
+
+inline void mariadb_increment_pages_read()
+{
+ mariadb_stats->pages_read_count++;
+}
+
+inline void mariadb_increment_undo_records_read()
+{
+ mariadb_stats->undo_records_read++;
+}
+
+/*
+ The following has to be identical code as measure() in sql_analyze_stmt.h
+
+ One should only call this if mariadb_stats_active() is true.
+*/
+
+inline ulonglong mariadb_measure()
+{
+#if (MY_TIMER_ROUTINE_CYCLES)
+ return my_timer_cycles();
+#else
+ return my_timer_microseconds();
+#endif
+}
+
+/*
+ Call this only of start_time != 0
+ See buf0rea.cc for an example of how to use it efficiently
+*/
+
+inline void mariadb_increment_pages_read_time(ulonglong start_time)
+{
+ ha_handler_stats *stats= mariadb_stats;
+ ulonglong end_time= mariadb_measure();
+ /* Check that we only call this if active, see example! */
+ DBUG_ASSERT(start_time);
+ DBUG_ASSERT(mariadb_stats_active(stats));
+
+ stats->pages_read_time+= (end_time - start_time);
+}
+
+
+/*
+ Helper class to set mariadb_stats temporarly for one call in handler.cc
+*/
+
+class mariadb_set_stats
+{
+public:
+ uint flag;
+ mariadb_set_stats(ha_handler_stats *stats)
+ {
+ mariadb_stats= stats ? stats : &mariadb_dummy_stats;
+ }
+ ~mariadb_set_stats()
+ {
+ mariadb_stats= &mariadb_dummy_stats;
+ }
+};
+
+#endif /* mariadb_stats_h */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
new file mode 100644
index 00000000..959147a6
--- /dev/null
+++ b/storage/innobase/include/mem0mem.h
@@ -0,0 +1,345 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0mem.h
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "mach0data.h"
+
+#include <memory>
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/** A block of a memory heap consists of the info structure
+followed by an area of memory */
+typedef struct mem_block_info_t mem_block_t;
+
+/** A memory heap is a nonempty linear list of memory blocks */
+typedef mem_block_t mem_heap_t;
+
+/** Types of allocation for memory heaps: DYNAMIC means allocation from the
+dynamic memory pool of the C compiler, BUFFER means allocation from the
+buffer pool; the latter method is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC 0 /* the most common type */
+#define MEM_HEAP_BUFFER 1
+#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be
+ ORed to MEM_HEAP_BUFFER, in which
+ case heap->free_block is used in
+ some cases for memory allocations,
+ and if it's NULL, the memory
+ allocation functions can return
+ NULL. */
+
+/** Different type of heaps in terms of which datastructure is using them */
+#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER)
+
+/** The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the parameter in the call of
+create. The standard size is the maximum (payload) size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE 64
+#define MEM_BLOCK_STANDARD_SIZE \
+ (srv_page_size >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF)
+
+/** If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer: */
+#define MEM_MAX_ALLOC_IN_BUF (srv_page_size - 200 + REDZONE_SIZE)
+
+/** Space needed when allocating for a user a field of length N.
+The space is allocated only in multiples of UNIV_MEM_ALIGNMENT. */
+#define MEM_SPACE_NEEDED(N) UT_CALC_ALIGN((N), UNIV_MEM_ALIGNMENT)
+
+#ifdef UNIV_DEBUG
+/** Macro for memory heap creation.
+@param[in] size Desired start block size. */
+# define mem_heap_create(size) \
+ mem_heap_create_func((size), __FILE__, __LINE__, MEM_HEAP_DYNAMIC)
+
+/** Macro for memory heap creation.
+@param[in] size Desired start block size.
+@param[in] type Heap type */
+# define mem_heap_create_typed(size, type) \
+ mem_heap_create_func((size), __FILE__, __LINE__, (type))
+
+#else /* UNIV_DEBUG */
+/** Macro for memory heap creation.
+@param[in] size Desired start block size. */
+# define mem_heap_create(size) mem_heap_create_func((size), MEM_HEAP_DYNAMIC)
+
+/** Macro for memory heap creation.
+@param[in] size Desired start block size.
+@param[in] type Heap type */
+# define mem_heap_create_typed(size, type) \
+ mem_heap_create_func((size), (type))
+
+#endif /* UNIV_DEBUG */
+
+/** Creates a memory heap.
+NOTE: Use the corresponding macros instead of this function.
+A single user buffer of 'size' will fit in the block.
+0 creates a default size block.
+@param[in] size Desired start block size.
+@param[in] file_name File name where created
+@param[in] line Line where created
+@param[in] type Heap type
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+ ulint size,
+#ifdef UNIV_DEBUG
+ const char* file_name,
+ unsigned line,
+#endif /* UNIV_DEBUG */
+ ulint type);
+
+/** Frees the space occupied by a memory heap.
+NOTE: Use the corresponding macro instead of this function.
+@param[in] heap Heap to be freed */
+UNIV_INLINE
+void
+mem_heap_free(
+ mem_heap_t* heap);
+
+/** Allocates and zero-fills n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+ mem_heap_t* heap,
+ ulint n);
+
+/** Allocates n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+ mem_heap_t* heap,
+ ulint n);
+
+/** Returns a pointer to the heap top.
+@param[in] heap memory heap
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+ mem_heap_t* heap);
+
+/** Frees the space in a memory heap exceeding the pointer given.
+The pointer must have been acquired from mem_heap_get_heap_top.
+The first memory block of the heap is not freed.
+@param[in] heap heap from which to free
+@param[in] old_top pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+ mem_heap_t* heap,
+ byte* old_top);
+
+/** Empties a memory heap.
+The first memory block of the heap is not freed.
+@param[in] heap heap to empty */
+UNIV_INLINE
+void
+mem_heap_empty(
+ mem_heap_t* heap);
+
+/** Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@param[in] heap memory heap
+@param[in] n size of the topmost element
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+ mem_heap_t* heap,
+ ulint n);
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap); /*!< in: heap */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/** Duplicate a block of data, allocated from a memory heap.
+@param[in] heap memory heap where string is allocated
+@param[in] data block of data to be copied
+@param[in] len length of data, in bytes
+@return own: a copy of data */
+inline
+void*
+mem_heap_dup(mem_heap_t* heap, const void* data, size_t len)
+{
+ ut_ad(data || !len);
+ return UNIV_LIKELY(data != NULL)
+ ? memcpy(mem_heap_alloc(heap, len), data, len)
+ : NULL;
+}
+
+/** Duplicate a NUL-terminated string, allocated from a memory heap.
+@param[in] heap memory heap where string is allocated
+@param[in] str string to be copied
+@return own: a copy of the string */
+inline
+char*
+mem_heap_strdup(mem_heap_t* heap, const char* str)
+{
+ return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1)));
+}
+
+/** Duplicate a string, allocated from a memory heap.
+@param[in] heap memory heap where string is allocated
+@param[in] str string to be copied
+@param[in] len length of str, in bytes
+@return own: a NUL-terminated copy of str */
+inline
+char*
+mem_heap_strdupl(mem_heap_t* heap, const char* str, size_t len)
+{
+ char* s = static_cast<char*>(mem_heap_alloc(heap, len + 1));
+ s[len] = 0;
+ return(static_cast<char*>(memcpy(s, str, len)));
+}
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2); /*!< in: string 2 */
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...) MY_ATTRIBUTE ((format (printf, 2, 3)));
+
+#ifdef UNIV_DEBUG
+/** Validates the contents of a memory heap.
+Asserts that the memory heap is consistent
+@param[in] heap Memory heap to validate */
+void
+mem_heap_validate(
+ const mem_heap_t* heap);
+
+#endif /* UNIV_DEBUG */
+
+/*#######################################################################*/
+
+/** The info structure stored at the beginning of a heap block */
+struct mem_block_info_t {
+#ifdef UNIV_DEBUG
+ char file_name[8];/* file name where the mem heap was created */
+ unsigned line; /*!< line number where the mem heap was created */
+#endif /* UNIV_DEBUG */
+ UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the
+ the list this is the base node of the list of blocks;
+ in subsequent blocks this is undefined */
+ UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+ and prev in the list. The first block allocated
+ to the heap is also the first block in this list,
+ though it also contains the base node of the list. */
+ ulint len; /*!< physical length of this block in bytes */
+ ulint total_size; /*!< physical length in bytes of all blocks
+ in the heap. This is defined only in the base
+ node and is set to ULINT_UNDEFINED in others. */
+ ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or
+ MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */
+ ulint free; /*!< offset in bytes of the first free position for
+ user data in the block */
+ ulint start; /*!< the value of the struct field 'free' at the
+ creation of the block */
+
+ void* free_block;
+ /* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+ and this is the heap root, this can contain an
+ allocated buffer frame, which can be appended as a
+ free block to the heap, if we need more space;
+ otherwise, this is NULL */
+ void* buf_block;
+ /* if this block has been allocated from the buffer
+ pool, this contains the buf_block_t handle;
+ otherwise, this is NULL */
+};
+
+/* Header size for a memory heap block */
+#define MEM_BLOCK_HEADER_SIZE UT_CALC_ALIGN(sizeof(mem_block_info_t),\
+ UNIV_MEM_ALIGNMENT)
+
+#include "mem0mem.inl"
+#endif
diff --git a/storage/innobase/include/mem0mem.inl b/storage/innobase/include/mem0mem.inl
new file mode 100644
index 00000000..9906daf3
--- /dev/null
+++ b/storage/innobase/include/mem0mem.inl
@@ -0,0 +1,468 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.ic
+The memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0new.h"
+
+#ifdef UNIV_DEBUG
+# define mem_heap_create_block(heap, n, type, file_name, line) \
+ mem_heap_create_block_func(heap, n, file_name, line, type)
+# define mem_heap_create_at(N, file_name, line) \
+ mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC)
+#else /* UNIV_DEBUG */
+# define mem_heap_create_block(heap, n, type, file_name, line) \
+ mem_heap_create_block_func(heap, n, type)
+# define mem_heap_create_at(N, file_name, line) \
+ mem_heap_create_func(N, MEM_HEAP_DYNAMIC)
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+ const char* file_name,/*!< in: file name where created */
+ unsigned line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint type); /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+
+/******************************************************************//**
+Frees a block from a memory heap. */
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block); /*!< in: block to free */
+
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap); /*!< in: heap */
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes needed
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_add_block(
+ mem_heap_t* heap,
+ ulint n);
+
+UNIV_INLINE
+void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+ ut_ad(len > 0);
+
+ block->len = len;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_len(mem_block_t* block)
+{
+ return(block->len);
+}
+
+UNIV_INLINE
+void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ block->type = type;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_type(mem_block_t* block)
+{
+ return(block->type);
+}
+
+UNIV_INLINE
+void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+ ut_ad(free > 0);
+ ut_ad(free <= mem_block_get_len(block));
+
+ block->free = free;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_free(mem_block_t* block)
+{
+ return(block->free);
+}
+
+UNIV_INLINE
+void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+ ut_ad(start > 0);
+
+ block->start = start;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_start(mem_block_t* block)
+{
+ return(block->start);
+}
+
+/** Allocates and zero-fills n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+ mem_heap_t* heap,
+ ulint n)
+{
+ ut_ad(heap);
+ ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH));
+ return(memset(mem_heap_alloc(heap, n), 0, n));
+}
+
+/** Allocates n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+ mem_heap_t* heap,
+ ulint n)
+{
+ mem_block_t* block;
+ byte* buf;
+ ulint free;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ n += REDZONE_SIZE;
+
+ ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+ /* Check if there is enough space in block. If not, create a new
+ block to the heap */
+
+ if (mem_block_get_len(block)
+ < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) {
+
+ block = mem_heap_add_block(heap, n);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+ }
+
+ free = mem_block_get_free(block);
+
+ buf = (byte*) block + free;
+
+ mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
+
+ buf = buf + REDZONE_SIZE;
+ MEM_MAKE_ADDRESSABLE(buf, n - REDZONE_SIZE);
+ return(buf);
+}
+
+/** Returns a pointer to the heap top.
+@param[in] heap memory heap
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+ mem_heap_t* heap)
+{
+ mem_block_t* block;
+ byte* buf;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*) block + mem_block_get_free(block);
+
+ return(buf);
+}
+
+/** Frees the space in a memory heap exceeding the pointer given.
+The pointer must have been acquired from mem_heap_get_heap_top.
+The first memory block of the heap is not freed.
+@param[in] heap heap from which to free
+@param[in] old_top pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+ mem_heap_t* heap,
+ byte* old_top)
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ ut_d(mem_heap_validate(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ while (block != NULL) {
+ if (((byte*) block + mem_block_get_free(block) >= old_top)
+ && ((byte*) block <= old_top)) {
+ /* Found the right block */
+
+ break;
+ }
+
+ /* Store prev_block value before freeing the current block
+ (the current block will be erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+
+ ut_ad(block);
+
+ /* Set the free field of block */
+ mem_block_set_free(block,
+ ulint(old_top - reinterpret_cast<byte*>(block)));
+
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+ MEM_NOACCESS(old_top, (byte*) block + block->len - old_top);
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ }
+}
+
+/** Empties a memory heap.
+The first memory block of the heap is not freed.
+@param[in] heap heap to empty */
+UNIV_INLINE
+void
+mem_heap_empty(
+ mem_heap_t* heap)
+{
+ mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap));
+
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+}
+
+/** Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@param[in] heap memory heap
+@param[in] n size of the topmost element
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+ mem_heap_t* heap,
+ ulint n)
+{
+ mem_block_t* block;
+ byte* buf;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*) block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n);
+
+ return((void*) buf);
+}
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element must be
+given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+
+ n += REDZONE_SIZE;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* Subtract the free field of block */
+ mem_block_set_free(block, mem_block_get_free(block)
+ - MEM_SPACE_NEEDED(n));
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ } else {
+ MEM_NOACCESS((byte*) block + mem_block_get_free(block), n);
+ }
+}
+
+/** Creates a memory heap.
+NOTE: Use the corresponding macros instead of this function.
+A single user buffer of 'size' will fit in the block.
+0 creates a default size block.
+@param[in] size Desired start block size.
+@param[in] file_name File name where created
+@param[in] line Line where created
+@param[in] type Heap type
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+ ulint size,
+#ifdef UNIV_DEBUG
+ const char* file_name,
+ unsigned line,
+#endif /* UNIV_DEBUG */
+ ulint type)
+{
+ mem_block_t* block;
+
+ if (!size) {
+ size = MEM_BLOCK_START_SIZE;
+ }
+
+ block = mem_heap_create_block(NULL, size, type, file_name, line);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+
+ /* The first block should not be in buffer pool,
+ because it might be relocated to resize buffer pool. */
+ ut_ad(block->buf_block == NULL);
+
+ UT_LIST_INIT(block->base, &mem_block_t::list);
+
+ /* Add the created block itself as the first block in the list */
+ UT_LIST_ADD_FIRST(block->base, block);
+
+ return(block);
+}
+
+/** Frees the space occupied by a memory heap.
+NOTE: Use the corresponding macro instead of this function.
+@param[in] heap Heap to be freed */
+UNIV_INLINE
+void
+mem_heap_free(
+ mem_heap_t* heap)
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+
+ while (block != NULL) {
+ /* Store the contents of info before freeing current block
+ (it is erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+}
+
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint size = heap->total_size;
+
+ if (heap->free_block) {
+ size += srv_page_size;
+ }
+
+ return(size);
+}
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str) /*!< in: string to be copied */
+{
+ ulint len = strlen(str) + 1;
+ return(static_cast<char*>(memcpy(ut_malloc_nokey(len), str, len)));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = static_cast<char*>(ut_malloc_nokey(len + 1));
+ s[len] = 0;
+ return(static_cast<char*>(memcpy(s, str, len)));
+}
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
new file mode 100644
index 00000000..e2419309
--- /dev/null
+++ b/storage/innobase/include/mtr0log.h
@@ -0,0 +1,637 @@
+/*****************************************************************************
+
+Copyright (c) 2019, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file include/mtr0log.h
+Mini-transaction log record encoding and decoding
+*******************************************************/
+
+#pragma once
+#include "mtr0mtr.h"
+
+/** The smallest invalid page identifier for persistent tablespaces */
+constexpr page_id_t end_page_id{SRV_SPACE_ID_UPPER_BOUND, 0};
+
+/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_2BYTE= 1 << 7;
+/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14);
+/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21);
+/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28);
+
+/** Error from mlog_decode_varint() */
+constexpr uint32_t MLOG_DECODE_ERROR= ~0U;
+
+/** Decode the length of a variable-length encoded integer.
+@param first first byte of the encoded integer
+@return the length, in bytes */
+inline uint8_t mlog_decode_varint_length(byte first)
+{
+ uint8_t len= 1;
+ for (; first & 0x80; len++, first= static_cast<uint8_t>(first << 1));
+ return len;
+}
+
+/** Decode an integer in a redo log record.
+@param log redo log record buffer
+@return the decoded integer
+@retval MLOG_DECODE_ERROR on error */
+template<typename byte_pointer>
+inline uint32_t mlog_decode_varint(const byte_pointer log)
+{
+ uint32_t i= *log;
+ if (i < MIN_2BYTE)
+ return i;
+ if (i < 0xc0)
+ return MIN_2BYTE + ((i & ~0x80) << 8 | log[1]);
+ if (i < 0xe0)
+ return MIN_3BYTE + ((i & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2]);
+ if (i < 0xf0)
+ return MIN_4BYTE + ((i & ~0xe0) << 24 | uint32_t{log[1]} << 16 |
+ uint32_t{log[2]} << 8 | log[3]);
+ if (i == 0xf0)
+ {
+ i= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+ uint32_t{log[3]} << 8 | log[4];
+ if (i <= ~MIN_5BYTE)
+ return MIN_5BYTE + i;
+ }
+ return MLOG_DECODE_ERROR;
+}
+
+/** Encode an integer in a redo log record.
+@param log redo log record buffer
+@param i the integer to encode
+@return end of the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+ if (i < MIN_2BYTE)
+ {
+ }
+ else if (i < MIN_3BYTE)
+ {
+ i-= MIN_2BYTE;
+ static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+ *log++= 0x80 | static_cast<byte>(i >> 8);
+ }
+ else if (i < MIN_4BYTE)
+ {
+ i-= MIN_3BYTE;
+ static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+ *log++= 0xc0 | static_cast<byte>(i >> 16);
+ goto last2;
+ }
+ else if (i < MIN_5BYTE)
+ {
+ i-= MIN_4BYTE;
+ static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+ *log++= 0xe0 | static_cast<byte>(i >> 24);
+ goto last3;
+ }
+ else
+ {
+ ut_ad(i < MLOG_DECODE_ERROR);
+ i-= MIN_5BYTE;
+ *log++= 0xf0;
+ *log++= static_cast<byte>(i >> 24);
+last3:
+ *log++= static_cast<byte>(i >> 16);
+last2:
+ *log++= static_cast<byte>(i >> 8);
+ }
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ *log++= static_cast<byte>(i);
+ return log;
+}
+
+/** Determine the length of a log record.
+@param log start of log record
+@param end end of the log record buffer
+@return the length of the record, in bytes
+@retval 0 if the log extends past the end
+@retval MLOG_DECODE_ERROR if the record is corrupted */
+inline uint32_t mlog_decode_len(const byte *log, const byte *end)
+{
+ ut_ad(log < end);
+ uint32_t i= *log;
+ if (!i)
+ return 0; /* end of mini-transaction */
+ if (~i & 15)
+ return (i & 15) + 1; /* 1..16 bytes */
+ if (UNIV_UNLIKELY(++log == end))
+ return 0; /* end of buffer */
+ i= *log;
+ if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */
+ return 16 + i;
+ if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */
+ {
+ if (UNIV_UNLIKELY(log + 1 == end))
+ return 0; /* end of buffer */
+ return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]);
+ }
+ if (i < 0xe0) /* 3 additional length bytes: 16528..1065103 bytes */
+ {
+ if (UNIV_UNLIKELY(log + 2 == end))
+ return 0; /* end of buffer */
+ return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 |
+ static_cast<uint32_t>(log[1]) << 8 | log[2]);
+ }
+ /* 1,065,103 bytes per log record ought to be enough for everyone */
+ return MLOG_DECODE_ERROR;
+}
+
+/** Write 1, 2, 4, or 8 bytes to a file page.
+@param[in] block file page
+@param[in,out] ptr pointer in file page
+@param[in] val value to write
+@tparam l number of bytes to write
+@tparam w write request type
+@tparam V type of val
+@return whether any log was written */
+template<unsigned l,mtr_t::write_type w,typename V>
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
+{
+ ut_ad(ut_align_down(ptr, srv_page_size) == block.page.frame);
+ static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+ byte buf[l];
+
+ switch (l) {
+ case 1:
+ ut_ad(val == static_cast<byte>(val));
+ buf[0]= static_cast<byte>(val);
+ break;
+ case 2:
+ ut_ad(val == static_cast<uint16_t>(val));
+ mach_write_to_2(buf, static_cast<uint16_t>(val));
+ break;
+ case 4:
+ ut_ad(val == static_cast<uint32_t>(val));
+ mach_write_to_4(buf, static_cast<uint32_t>(val));
+ break;
+ case 8:
+ mach_write_to_8(buf, val);
+ break;
+ }
+ byte *p= static_cast<byte*>(ptr);
+ const byte *const end= p + l;
+ if (w != FORCED && is_logged())
+ {
+ const byte *b= buf;
+ while (*p++ == *b++)
+ {
+ if (p == end)
+ {
+ ut_ad(w == MAYBE_NOP);
+ return false;
+ }
+ }
+ p--;
+ }
+ ::memcpy(ptr, buf, l);
+ memcpy_low(block, static_cast<uint16_t>
+ (ut_align_offset(p, srv_page_size)), p, end - p);
+ return true;
+}
+
+/** Log an initialization of a string of bytes.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+ ut_ad(len);
+ set_modified(b);
+ if (!is_logged())
+ return;
+
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 2 + 1 : 3 + 1);
+ byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen, true, ofs);
+ l= mlog_encode_varint(l, len);
+ *l++= val;
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a string of bytes.
+@param[in,out] b buffer page
+@param[in] ofs byte offset from block->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+ ut_ad(ofs <= ulint(srv_page_size));
+ ut_ad(ofs + len <= ulint(srv_page_size));
+ ::memset(ofs + b->page.frame, val, len);
+ memset(*b, ofs, len, val);
+}
+
+/** Log an initialization of a repeating string of bytes.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write, in bytes
+@param[in] str the string to write
+@param[in] size size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+ const void *str, size_t size)
+{
+ ut_ad(size);
+ ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+ set_modified(b);
+ if (!is_logged())
+ return;
+
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+ byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size, true, ofs);
+ l= mlog_encode_varint(l, len);
+ ::memcpy(l, str, size);
+ l+= size;
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a repeating string of bytes.
+@param[in,out] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write, in bytes
+@param[in] str the string to write
+@param[in] size size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
+ const void *str, size_t size)
+{
+ ut_ad(ofs <= ulint(srv_page_size));
+ ut_ad(ofs + len <= ulint(srv_page_size));
+ ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+ size_t s= 0;
+ while (s < len)
+ {
+ ::memcpy(ofs + s + b->page.frame, str, size);
+ s+= len;
+ }
+ ::memcpy(ofs + s + b->page.frame, str, len - s);
+ memset(*b, ofs, len, str, size);
+}
+
+/** Log a write of a byte string to a page.
+@param[in] b buffer page
+@param[in] offset byte offset from b->frame
+@param[in] str the data to write
+@param[in] len length of the data to write */
+inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+ ut_ad(len);
+ ut_ad(offset <= ulint(srv_page_size));
+ ut_ad(offset + len <= ulint(srv_page_size));
+ memcpy_low(b, uint16_t(offset), &b.page.frame[offset], len);
+}
+
+/** Log a write of a byte string to a page.
+@param block page
+@param offset byte offset within page
+@param data data to be written
+@param len length of the data, in bytes */
+inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
+ const void *data, size_t len)
+{
+ ut_ad(len);
+ set_modified(block);
+ if (!is_logged())
+ return;
+ if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
+ {
+ byte *end= log_write<WRITE>(block.page.id(), &block.page, len, true,
+ offset);
+ ::memcpy(end, data, len);
+ m_log.close(end + len);
+ }
+ else
+ {
+ m_log.close(log_write<WRITE>(block.page.id(), &block.page, len, false,
+ offset));
+ m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+ }
+ m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Log that a string of bytes was copied from the same page.
+@param[in] b buffer page
+@param[in] d destination offset within the page
+@param[in] s source offset within the page
+@param[in] len length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
+{
+ ut_ad(d >= 8);
+ ut_ad(s >= 8);
+ ut_ad(len);
+ ut_ad(s <= ulint(srv_page_size));
+ ut_ad(s + len <= ulint(srv_page_size));
+ ut_ad(s != d);
+ ut_ad(d <= ulint(srv_page_size));
+ ut_ad(d + len <= ulint(srv_page_size));
+
+ set_modified(b);
+ if (!is_logged())
+ return;
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+ /* The source offset is encoded relative to the destination offset,
+ with the sign in the least significant bit. */
+ if (s > d)
+ s= (s - d) << 1;
+ else
+ s= (d - s) << 1 | 1;
+ /* The source offset 0 is not possible. */
+ s-= 1 << 1;
+ size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 2 : 3);
+ byte *l= log_write<MEMMOVE>(b.page.id(), &b.page, lenlen + slen, true, d);
+ l= mlog_encode_varint(l, len);
+ l= mlog_encode_varint(l, s);
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(d + len);
+}
+
+/**
+Write a log record.
+@tparam type redo log record type
+@param id persistent page identifier
+@param bpage buffer pool page, or nullptr
+@param len number of additional bytes to write
+@param alloc whether to allocate the additional bytes
+@param offset byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+ size_t len, bool alloc, size_t offset)
+{
+ static_assert(!(type & 15) && type != RESERVED &&
+ type <= FILE_CHECKPOINT, "invalid type");
+ ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+ ut_ad(!bpage || bpage->id() == id);
+ ut_ad(id < end_page_id);
+ constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+ constexpr bool have_offset= type == WRITE || type == MEMSET ||
+ type == MEMMOVE;
+ static_assert(!have_offset || have_len, "consistency");
+ ut_ad(have_len || len == 0);
+ ut_ad(have_len || !alloc);
+ ut_ad(have_offset || offset == 0);
+ ut_ad(offset + len <= srv_page_size);
+ static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+ ut_ad(type == FREE_PAGE || type == OPTION || (type == EXTENDED && !bpage) ||
+ memo_contains_flagged(bpage, MTR_MEMO_MODIFY));
+ size_t max_len;
+ if (!have_len)
+ max_len= 1 + 5 + 5;
+ else if (!have_offset)
+ max_len= bpage && m_last == bpage
+ ? 1 + 3
+ : 1 + 3 + 5 + 5;
+ else if (bpage && m_last == bpage && m_last_offset <= offset)
+ {
+ /* Encode the offset relative from m_last_offset. */
+ offset-= m_last_offset;
+ max_len= 1 + 3 + 3;
+ }
+ else
+ max_len= 1 + 3 + 5 + 5 + 3;
+ byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+ byte *end= log_ptr + 1;
+ const byte same_page= max_len < 1 + 5 + 5 ? 0x80 : 0;
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ m_last= bpage;
+ }
+ if (have_offset)
+ {
+ byte* oend= mlog_encode_varint(end, offset);
+ if (oend + len > &log_ptr[16])
+ {
+ len+= oend - log_ptr - 15;
+ if (len >= MIN_3BYTE - 1)
+ len+= 2;
+ else if (len >= MIN_2BYTE)
+ len++;
+
+ *log_ptr= type | same_page;
+ end= mlog_encode_varint(log_ptr + 1, len);
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ }
+ end= mlog_encode_varint(end, offset);
+ return end;
+ }
+ else
+ end= oend;
+ }
+ else if (len >= 3 && end + len > &log_ptr[16])
+ {
+ len+= end - log_ptr - 15;
+ if (len >= MIN_3BYTE - 1)
+ len+= 2;
+ else if (len >= MIN_2BYTE)
+ len++;
+
+ end= log_ptr;
+ *end++= type | same_page;
+ end= mlog_encode_varint(end, len);
+
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ }
+ return end;
+ }
+
+ ut_ad(end + len >= &log_ptr[1] + !same_page);
+ ut_ad(end + len <= &log_ptr[16]);
+ ut_ad(end <= &log_ptr[max_len]);
+ *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+ ut_ad(*log_ptr & 15);
+ return end;
+}
+
+/** Write a byte string to a page.
+@param[in] b buffer page
+@param[in] dest destination within b.frame
+@param[in] str the data to write
+@param[in] len length of the data to write
+@tparam w write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len)
+{
+ ut_ad(ut_align_down(dest, srv_page_size) == b.page.frame);
+ char *d= static_cast<char*>(dest);
+ const char *s= static_cast<const char*>(str);
+ if (w != FORCED && is_logged())
+ {
+ ut_ad(len);
+ const char *const end= d + len;
+ while (*d++ == *s++)
+ {
+ if (d == end)
+ {
+ ut_ad(w == MAYBE_NOP);
+ return;
+ }
+ }
+ s--;
+ d--;
+ len= static_cast<ulint>(end - d);
+ }
+ ::memcpy(d, s, len);
+ memcpy(b, ut_align_offset(d, srv_page_size), len);
+}
+
+/** Write an EXTENDED log record.
+@param block buffer pool page
+@param type extended record subtype; @see mrec_ext_t */
+inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
+{
+ set_modified(block);
+ if (!is_logged())
+ return;
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
+ *l++= type;
+ m_log.close(l);
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for partly initializing a B-tree or R-tree page.
+@param block B-tree or R-tree page
+@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+inline void mtr_t::page_create(const buf_block_t &block, bool comp)
+{
+ static_assert(false == INIT_ROW_FORMAT_REDUNDANT, "encoding");
+ static_assert(true == INIT_ROW_FORMAT_DYNAMIC, "encoding");
+ log_write_extended(block, comp);
+}
+
+/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+@param block B-tree or R-tree page
+@param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_OLD_INFIMUM */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
+{
+ ut_ad(!block.zip_size());
+ ut_ad(prev_rec < block.physical_size());
+ set_modified(block);
+ if (!is_logged())
+ return;
+ size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+ ut_d(byte *end= l + len);
+ *l++= DELETE_ROW_FORMAT_REDUNDANT;
+ l= mlog_encode_varint(l, prev_rec);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+@param block B-tree or R-tree page
+@param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_NEW_INFIMUM
+@param prev_rec the predecessor of the record to delete
+@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size data payload size, in bytes */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
+ size_t hdr_size, size_t data_size)
+{
+ ut_ad(!block.zip_size());
+ set_modified(block);
+ ut_ad(hdr_size < MIN_3BYTE);
+ ut_ad(prev_rec < block.physical_size());
+ ut_ad(data_size < block.physical_size());
+ if (!is_logged())
+ return;
+ size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+ len+= hdr_size < MIN_2BYTE ? 1 : 2;
+ len+= data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 2 : 3;
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+ ut_d(byte *end= l + len);
+ *l++= DELETE_ROW_FORMAT_DYNAMIC;
+ l= mlog_encode_varint(l, prev_rec);
+ l= mlog_encode_varint(l, hdr_size);
+ l= mlog_encode_varint(l, data_size);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for initializing an undo log page.
+@param block undo page */
+inline void mtr_t::undo_create(const buf_block_t &block)
+{
+ log_write_extended(block, UNDO_INIT);
+}
+
+/** Write log for appending an undo log record.
+@param block undo page
+@param data record within the undo page
+@param len length of the undo record, in bytes */
+inline void mtr_t::undo_append(const buf_block_t &block,
+ const void *data, size_t len)
+{
+ ut_ad(len > 2);
+ set_modified(block);
+ if (!is_logged())
+ return;
+ const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+ byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
+ if (UNIV_LIKELY(small))
+ {
+ *end++= UNDO_APPEND;
+ ::memcpy(end, data, len);
+ m_log.close(end + len);
+ }
+ else
+ {
+ m_log.close(end);
+ *m_log.push<byte*>(1)= UNDO_APPEND;
+ m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+ }
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Trim the end of a tablespace.
+@param id first page identifier that will not be in the file */
+inline void mtr_t::trim_pages(const page_id_t id)
+{
+ if (!is_logged())
+ return;
+ byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
+ *l++= TRIM_PAGES;
+ m_log.close(l);
+ set_trim_pages();
+}
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000..841cfab1
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.h
@@ -0,0 +1,780 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fil0fil.h"
+#include "dyn0buf.h"
+#include "buf0buf.h"
+#include "small_vector.h"
+
+/** Start a mini-transaction. */
+#define mtr_start(m) (m)->start()
+
+/** Commit a mini-transaction. */
+#define mtr_commit(m) (m)->commit()
+
+/** Change the logging mode of a mini-transaction.
+@return old mode */
+#define mtr_set_log_mode(m, d) (m)->set_log_mode((d))
+
+#ifdef UNIV_PFS_RWLOCK
+# define mtr_s_lock_index(i,m) (m)->s_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_x_lock_index(i,m) (m)->x_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_sx_lock_index(i,m) (m)->u_lock(__FILE__, __LINE__, &(i)->lock)
+#else
+# define mtr_s_lock_index(i,m) (m)->s_lock(&(i)->lock)
+# define mtr_x_lock_index(i,m) (m)->x_lock(&(i)->lock)
+# define mtr_sx_lock_index(i,m) (m)->u_lock(&(i)->lock)
+#endif
+
+/** Mini-transaction memo stack slot. */
+struct mtr_memo_slot_t
+{
+ /** pointer to the object */
+ void *object;
+ /** type of the stored object */
+ mtr_memo_type_t type;
+
+ /** Release the object */
+ void release() const;
+};
+
+/** Mini-transaction handle and buffer */
+struct mtr_t {
+ mtr_t();
+ ~mtr_t();
+
+ /** Start a mini-transaction. */
+ void start();
+
+ /** Commit the mini-transaction. */
+ void commit();
+
+ /** Release latches of unmodified buffer pages.
+ @param begin first slot to release
+ @param end last slot to release, or get_savepoint() */
+ void rollback_to_savepoint(ulint begin, ulint end);
+
+ /** Release latches of unmodified buffer pages.
+ @param begin first slot to release */
+ void rollback_to_savepoint(ulint begin)
+ { rollback_to_savepoint(begin, m_memo.size()); }
+
+ /** Release the last acquired buffer page latch. */
+ void release_last_page()
+ { auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
+
+ /** Commit a mini-transaction that is shrinking a tablespace.
+ @param space tablespace that is being shrunk */
+ ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+
+ /** Commit a mini-transaction that is deleting or renaming a file.
+ @param space tablespace that is being renamed or deleted
+ @param name new file name (nullptr=the file will be deleted)
+ @return whether the operation succeeded */
+ ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name);
+
+ /** Commit a mini-transaction that did not modify any pages,
+ but generated some redo log on a higher level, such as
+ FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+ The caller must hold exclusive log_sys.latch.
+ This is to be used at log_checkpoint().
+ @param checkpoint_lsn the log sequence number of a checkpoint, or 0
+ @return current LSN */
+ lsn_t commit_files(lsn_t checkpoint_lsn= 0);
+
+ /** @return mini-transaction savepoint (current size of m_memo) */
+ ulint get_savepoint() const
+ {
+ ut_ad(is_active());
+ return m_memo.size();
+ }
+
+ /** Get the block at a savepoint */
+ buf_block_t *at_savepoint(ulint savepoint) const
+ {
+ ut_ad(is_active());
+ const mtr_memo_slot_t &slot= m_memo[savepoint];
+ ut_ad(slot.type < MTR_MEMO_S_LOCK);
+ ut_ad(slot.object);
+ return static_cast<buf_block_t*>(slot.object);
+ }
+
+ /** Try to get a block at a savepoint.
+ @param savepoint the savepoint right before the block was acquired
+ @return the block at the savepoint
+ @retval nullptr if no buffer block was registered at that savepoint */
+ buf_block_t *block_at_savepoint(ulint savepoint) const
+ {
+ ut_ad(is_active());
+ const mtr_memo_slot_t &slot= m_memo[savepoint];
+ return slot.type < MTR_MEMO_S_LOCK
+ ? static_cast<buf_block_t*>(slot.object)
+ : nullptr;
+ }
+
+ /** Retrieve a page that has already been latched.
+ @param id page identifier
+ @param type page latch type
+ @return block
+ @retval nullptr if the block had not been latched yet */
+ buf_block_t *get_already_latched(const page_id_t id, mtr_memo_type_t type)
+ const;
+
+ /** @return the logging mode */
+ mtr_log_t get_log_mode() const
+ {
+ static_assert(MTR_LOG_ALL == 0, "efficiency");
+ return static_cast<mtr_log_t>(m_log_mode);
+ }
+
+ /** @return whether log is to be written for changes */
+ bool is_logged() const
+ {
+ static_assert(MTR_LOG_ALL == 0, "efficiency");
+ static_assert(MTR_LOG_NONE & MTR_LOG_NO_REDO, "efficiency");
+ static_assert(!(MTR_LOG_NONE & MTR_LOG_SUB), "efficiency");
+ return !(m_log_mode & MTR_LOG_NONE);
+ }
+
+ /** Change the logging mode.
+ @param mode logging mode
+ @return old mode */
+ mtr_log_t set_log_mode(mtr_log_t mode)
+ {
+ const mtr_log_t old_mode= get_log_mode();
+ m_log_mode= mode & 3;
+ return old_mode;
+ }
+
+ /** Set the log mode of a sub-minitransaction
+ @param mtr parent mini-transaction */
+ void set_log_mode_sub(const mtr_t &mtr)
+ {
+ ut_ad(mtr.m_log_mode == MTR_LOG_ALL || mtr.m_log_mode == MTR_LOG_NO_REDO);
+ m_log_mode= mtr.m_log_mode | MTR_LOG_SUB;
+ static_assert((MTR_LOG_SUB | MTR_LOG_NO_REDO) == MTR_LOG_NO_REDO, "");
+ }
+
+ /** Check if we are holding a block latch in exclusive mode
+ @param block buffer pool block to search for */
+ bool have_x_latch(const buf_block_t &block) const;
+
+ /** Check if we are holding a block latch in S or U mode
+ @param block buffer pool block to search for */
+ bool have_u_or_x_latch(const buf_block_t &block) const;
+
+ /** Copy the tablespaces associated with the mini-transaction
+ (needed for generating FILE_MODIFY records)
+ @param[in] mtr mini-transaction that may modify
+ the same set of tablespaces as this one */
+ void set_spaces(const mtr_t& mtr)
+ {
+ ut_ad(!m_user_space_id);
+ ut_ad(!m_user_space);
+
+ ut_d(m_user_space_id = mtr.m_user_space_id);
+ m_user_space = mtr.m_user_space;
+ }
+
+ /** Set the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space_id user or system tablespace ID
+ @return the tablespace */
+ fil_space_t* set_named_space_id(uint32_t space_id)
+ {
+ ut_ad(!m_user_space_id);
+ ut_d(m_user_space_id = space_id);
+ if (!space_id) {
+ return fil_system.sys_space;
+ } else {
+ ut_ad(m_user_space_id == space_id);
+ ut_ad(!m_user_space);
+ m_user_space = fil_space_get(space_id);
+ ut_ad(m_user_space);
+ return m_user_space;
+ }
+ }
+
+ /** Set the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space user or system tablespace */
+ void set_named_space(fil_space_t* space)
+ {
+ ut_ad(!m_user_space_id);
+ ut_d(m_user_space_id = space->id);
+ if (space->id) {
+ m_user_space = space;
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ /** Check the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space tablespace
+ @return whether the mini-transaction is associated with the space */
+ bool is_named_space(uint32_t space) const;
+ /** Check the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space tablespace
+ @return whether the mini-transaction is associated with the space */
+ bool is_named_space(const fil_space_t* space) const;
+#endif /* UNIV_DEBUG */
+
+ /** Acquire a tablespace X-latch.
+ @param space_id tablespace ID
+ @return the tablespace object (never NULL) */
+ fil_space_t *x_lock_space(uint32_t space_id);
+
+ /** Acquire a shared rw-latch. */
+ void s_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->s_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_S_LOCK);
+ }
+
+ /** Acquire an exclusive rw-latch. */
+ void x_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->x_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_X_LOCK);
+ }
+
+ /** Acquire an update latch. */
+ void u_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->u_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_SX_LOCK);
+ }
+
+ /** Acquire an exclusive tablespace latch.
+ @param space tablespace */
+ void x_lock_space(fil_space_t *space);
+
+ /** Release an index latch. */
+ void release(const index_lock &lock) { release(&lock); }
+ /** Release a latch to an unmodified page. */
+ void release(const buf_block_t &block) { release(&block); }
+private:
+ /** Release an unmodified object. */
+ void release(const void *object);
+public:
+ /** Mark the given latched page as modified.
+ @param block page that will be modified */
+ void set_modified(const buf_block_t &block);
+
+ /** Set the state to not-modified. This will not log the changes.
+ This is only used during redo log apply, to avoid logging the changes. */
+ void discard_modifications() { m_modifications= false; }
+
+ /** Get the LSN of commit().
+ @return the commit LSN
+ @retval 0 if the transaction only modified temporary tablespaces */
+ lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; }
+
+ /** Note that we are inside the change buffer code. */
+ void enter_ibuf() { m_inside_ibuf= true; }
+
+ /** Note that we have exited from the change buffer code. */
+ void exit_ibuf() { m_inside_ibuf= false; }
+
+ /** @return true if we are inside the change buffer code */
+ bool is_inside_ibuf() const { return m_inside_ibuf; }
+
+ /** Note that some pages have been freed */
+ void set_trim_pages() { m_trim_pages= true; }
+
+ /** Latch a buffer pool block.
+ @param block block to be latched
+ @param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ void page_lock(buf_block_t *block, ulint rw_latch);
+
+ /** Acquire a latch on a buffer-fixed buffer pool block.
+ @param savepoint savepoint location of the buffer-fixed block
+ @param rw_latch latch to acquire */
+ void upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch);
+
+ /** Register a change to the page latch state. */
+ void lock_register(ulint savepoint, mtr_memo_type_t type)
+ {
+ mtr_memo_slot_t &slot= m_memo[savepoint];
+ ut_ad(slot.type <= MTR_MEMO_BUF_FIX);
+ ut_ad(type < MTR_MEMO_S_LOCK);
+ slot.type= type;
+ }
+
+ /** Upgrade U locks on a block to X */
+ void page_lock_upgrade(const buf_block_t &block);
+
+ /** Upgrade index U lock to X */
+ ATTRIBUTE_COLD void index_lock_upgrade();
+
+ /** Check if we are holding tablespace latch
+ @param space tablespace to search for
+ @return whether space.latch is being held */
+ bool memo_contains(const fil_space_t& space) const
+ MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+ /** Check if we are holding an rw-latch in this mini-transaction
+ @param lock latch to search for
+ @param type held latch type
+ @return whether (lock,type) is contained */
+ bool memo_contains(const index_lock &lock, mtr_memo_type_t type) const
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Check if memo contains an index or buffer block latch.
+ @param object object to search
+ @param flags specify types of object latches
+ @return true if contains */
+ bool memo_contains_flagged(const void *object, ulint flags) const
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+ /** Check if memo contains the given page.
+ @param ptr pointer to within page frame
+ @param flags types latch to look for
+ @return the block
+ @retval nullptr if not found */
+ buf_block_t *memo_contains_page_flagged(const byte *ptr, ulint flags) const;
+
+ /** @return whether this mini-transaction modifies persistent data */
+ bool has_modifications() const { return m_modifications; }
+#endif /* UNIV_DEBUG */
+
+ /** Push a buffer page to an the memo.
+ @param block buffer block
+ @param type object type: MTR_MEMO_S_LOCK, ... */
+ void memo_push(buf_block_t *block, mtr_memo_type_t type)
+ __attribute__((nonnull))
+ {
+ ut_ad(is_active());
+ ut_ad(type <= MTR_MEMO_PAGE_SX_MODIFY);
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.in_file());
+#ifdef UNIV_DEBUG
+ switch (type) {
+ case MTR_MEMO_PAGE_S_FIX:
+ ut_ad(block->page.lock.have_s());
+ break;
+ case MTR_MEMO_PAGE_X_FIX: case MTR_MEMO_PAGE_X_MODIFY:
+ ut_ad(block->page.lock.have_x());
+ break;
+ case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_SX_MODIFY:
+ ut_ad(block->page.lock.have_u_or_x());
+ break;
+ case MTR_MEMO_BUF_FIX:
+ break;
+ case MTR_MEMO_MODIFY:
+ case MTR_MEMO_S_LOCK: case MTR_MEMO_X_LOCK: case MTR_MEMO_SX_LOCK:
+ case MTR_MEMO_SPACE_X_LOCK:
+ ut_ad("invalid type" == 0);
+ }
+#endif
+ if (!(type & MTR_MEMO_MODIFY));
+ else if (block->page.id().space() >= SRV_TMP_SPACE_ID)
+ {
+ block->page.set_temp_modified();
+ type= mtr_memo_type_t(type & ~MTR_MEMO_MODIFY);
+ }
+ else
+ {
+ m_modifications= true;
+ if (!m_made_dirty)
+ /* If we are going to modify a previously clean persistent page,
+ we must set m_made_dirty, so that commit() will acquire
+ log_sys.flush_order_mutex and insert the block into
+ buf_pool.flush_list. */
+ m_made_dirty= block->page.oldest_modification() <= 1;
+ }
+ m_memo.emplace_back(mtr_memo_slot_t{block, type});
+ }
+
+ /** Push an index lock or tablespace latch to the memo.
+ @param object index lock or tablespace latch
+ @param type object type: MTR_MEMO_S_LOCK, ... */
+ void memo_push(void *object, mtr_memo_type_t type) __attribute__((nonnull))
+ {
+ ut_ad(is_active());
+ ut_ad(type >= MTR_MEMO_S_LOCK);
+ m_memo.emplace_back(mtr_memo_slot_t{object, type});
+ }
+
+ /** @return the size of the log is empty */
+ size_t get_log_size() const { return m_log.size(); }
+ /** @return whether the log and memo are empty */
+ bool is_empty() const { return !get_savepoint() && !get_log_size(); }
+
+ /** Write an OPT_PAGE_CHECKSUM record. */
+ inline void page_checksum(const buf_page_t &bpage);
+
+ /** Write request types */
+ enum write_type
+ {
+ /** the page is guaranteed to always change */
+ NORMAL= 0,
+ /** optional: the page contents might not change */
+ MAYBE_NOP,
+ /** force a write, even if the page contents is not changing */
+ FORCED
+ };
+
+ /** Write 1, 2, 4, or 8 bytes to a file page.
+ @param[in] block file page
+ @param[in,out] ptr pointer in file page
+ @param[in] val value to write
+ @tparam l number of bytes to write
+ @tparam w write request type
+ @tparam V type of val
+ @return whether any log was written */
+ template<unsigned l,write_type w= NORMAL,typename V>
+ inline bool write(const buf_block_t &block, void *ptr, V val)
+ MY_ATTRIBUTE((nonnull));
+
+ /** Log a write of a byte string to a page.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write */
+ inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
+
+ /** Write a byte string to a page.
+ @param[in,out] b buffer page
+ @param[in] dest destination within b.frame
+ @param[in] str the data to write
+ @param[in] len length of the data to write
+ @tparam w write request type */
+ template<write_type w= NORMAL>
+ inline void memcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len);
+
+ /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
+ @param[in] b ROW_FORMAT=COMPRESSED index page
+ @param[in] offset byte offset from b.zip.data
+ @param[in] len length of the data to write */
+ inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len);
+
+ /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+ @param[in] b ROW_FORMAT=COMPRESSED index page
+ @param[in] dest destination within b.zip.data
+ @param[in] str the data to write
+ @param[in] len length of the data to write
+ @tparam w write request type */
+ template<write_type w= NORMAL>
+ inline void zmemcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len);
+
+ /** Log an initialization of a string of bytes.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write
+ @param[in] val the data byte to write */
+ inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
+
+ /** Initialize a string of bytes.
+ @param[in,out] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write
+ @param[in] val the data byte to write */
+ inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
+
+ /** Log an initialization of a repeating string of bytes.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write, in bytes
+ @param[in] str the string to write
+ @param[in] size size of str, in bytes */
+ inline void memset(const buf_block_t &b, ulint ofs, size_t len,
+ const void *str, size_t size);
+
+ /** Initialize a repeating string of bytes.
+ @param[in,out] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write, in bytes
+ @param[in] str the string to write
+ @param[in] size size of str, in bytes */
+ inline void memset(const buf_block_t *b, ulint ofs, size_t len,
+ const void *str, size_t size);
+
+ /** Log that a string of bytes was copied from the same page.
+ @param[in] b buffer page
+ @param[in] d destination offset within the page
+ @param[in] s source offset within the page
+ @param[in] len length of the data to copy */
+ inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
+
+ /** Initialize an entire page.
+ @param[in,out] b buffer page */
+ void init(buf_block_t *b);
+ /** Free a page.
+ @param space tablespace
+ @param offset offset of the page to be freed */
+ void free(const fil_space_t &space, uint32_t offset);
+ /** Write log for partly initializing a B-tree or R-tree page.
+ @param block B-tree or R-tree page
+ @param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+ inline void page_create(const buf_block_t &block, bool comp);
+
+ /** Write log for inserting a B-tree or R-tree record in
+ ROW_FORMAT=REDUNDANT.
+ @param block B-tree or R-tree page
+ @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+ @param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_OLD_INFIMUM
+ @param info_bits info_bits of the record
+ @param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
+ @param hdr_c number of common record header bytes with prev_rec
+ @param data_c number of common data bytes with prev_rec
+ @param hdr record header bytes to copy to the log
+ @param hdr_l number of copied record header bytes
+ @param data record payload bytes to copy to the log
+ @param data_l number of copied record data bytes */
+ inline void page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_bits,
+ ulint n_fields_s, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l);
+ /** Write log for inserting a B-tree or R-tree record in
+ ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
+ @param block B-tree or R-tree page
+ @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+ @param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_NEW_INFIMUM
+ @param info_status rec_get_info_and_status_bits()
+ @param shift unless !reuse: number of bytes the PAGE_FREE is moving
+ @param hdr_c number of common record header bytes with prev_rec
+ @param data_c number of common data bytes with prev_rec
+ @param hdr record header bytes to copy to the log
+ @param hdr_l number of copied record header bytes
+ @param data record payload bytes to copy to the log
+ @param data_l number of copied record data bytes */
+ inline void page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_status,
+ ssize_t shift, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l);
+ /** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+ @param block B-tree or R-tree page
+ @param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_OLD_INFIMUM */
+ inline void page_delete(const buf_block_t &block, ulint prev_rec);
+ /** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+ @param block B-tree or R-tree page
+ @param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_NEW_INFIMUM
+ @param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+ @param data_size data payload size, in bytes */
+ inline void page_delete(const buf_block_t &block, ulint prev_rec,
+ size_t hdr_size, size_t data_size);
+
+ /** Write log for initializing an undo log page.
+ @param block undo page */
+ inline void undo_create(const buf_block_t &block);
+ /** Write log for appending an undo log record.
+ @param block undo page
+ @param data record within the undo page
+ @param len length of the undo record, in bytes */
+ inline void undo_append(const buf_block_t &block,
+ const void *data, size_t len);
+ /** Trim the end of a tablespace.
+ @param id first page identifier that will not be in the file */
+ inline void trim_pages(const page_id_t id);
+
+ /** Write a log record about a file operation.
+ @param type file operation
+ @param space_id tablespace identifier
+ @param path file path
+ @param new_path new file path for type=FILE_RENAME */
+ inline void log_file_op(mfile_type_t type, uint32_t space_id,
+ const char *path,
+ const char *new_path= nullptr);
+
+ /** Add freed page numbers to freed_pages */
+ void add_freed_offset(fil_space_t *space, uint32_t page)
+ {
+ ut_ad(is_named_space(space));
+ if (!m_freed_pages)
+ {
+ m_freed_pages= new range_set();
+ ut_ad(!m_freed_space);
+ m_freed_space= space;
+ }
+ else
+ ut_ad(m_freed_space == space);
+ m_freed_pages->add_value(page);
+ }
+
+ /** Determine the added buffer fix count of a block.
+ @param block block to be checked
+ @return number of buffer count added by this mtr */
+ uint32_t get_fix_count(const buf_block_t *block) const;
+
+ /** Note that log_sys.latch is no longer being held exclusively. */
+ void flag_wr_unlock() noexcept { ut_ad(m_latch_ex); m_latch_ex= false; }
+
+ /** type of page flushing is needed during commit() */
+ enum page_flush_ahead
+ {
+ /** no need to trigger page cleaner */
+ PAGE_FLUSH_NO= 0,
+ /** asynchronous flushing is needed */
+ PAGE_FLUSH_ASYNC,
+ /** furious flushing is needed */
+ PAGE_FLUSH_SYNC
+ };
+
+private:
+ /** Handle any pages that were freed during the mini-transaction. */
+ void process_freed_pages();
+ /** Release modified pages when no log was written. */
+ void release_unlogged();
+
+ /** Log a write of a byte string to a page.
+ @param block buffer page
+ @param offset byte offset within page
+ @param data data to be written
+ @param len length of the data, in bytes */
+ inline void memcpy_low(const buf_block_t &block, uint16_t offset,
+ const void *data, size_t len);
+ /**
+ Write a log record.
+ @tparam type redo log record type
+ @param id persistent page identifier
+ @param bpage buffer pool page, or nullptr
+ @param len number of additional bytes to write
+ @param alloc whether to allocate the additional bytes
+ @param offset byte offset, or 0 if the record type does not allow one
+ @return end of mini-transaction log, minus len */
+ template<byte type>
+ inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
+ size_t len= 0, bool alloc= false, size_t offset= 0);
+
+ /** Write an EXTENDED log record.
+ @param block buffer pool page
+ @param type extended record subtype; @see mrec_ext_t */
+ inline void log_write_extended(const buf_block_t &block, byte type);
+
+ /** Write a FILE_MODIFY record when a non-predefined persistent
+ tablespace was modified for the first time since fil_names_clear(). */
+ ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void name_write();
+
+ /** Encrypt the log */
+ ATTRIBUTE_NOINLINE void encrypt();
+
+ /** Append the redo log records to the redo log buffer.
+ @return {start_lsn,flush_ahead} */
+ std::pair<lsn_t,page_flush_ahead> do_write();
+
+ /** Append the redo log records to the redo log buffer.
+ @param len number of bytes to write
+ @return {start_lsn,flush_ahead} */
+ std::pair<lsn_t,page_flush_ahead> finish_write(size_t len);
+
+ /** Release all latches. */
+ void release();
+ /** Release the resources */
+ inline void release_resources();
+
+#ifdef UNIV_DEBUG
+public:
+ /** @return whether the mini-transaction is active */
+ bool is_active() const
+ { ut_ad(!m_commit || m_start); return m_start && !m_commit; }
+ /** @return whether the mini-transaction has been committed */
+ bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+ /** @return whether the mini-transaction is freeing an index tree */
+ bool is_freeing_tree() const { return m_freeing_tree; }
+ /** Notify that the mini-transaction is freeing an index tree */
+ void freeing_tree() { m_freeing_tree= true; }
+private:
+ /** whether start() has been called */
+ bool m_start= false;
+ /** whether commit() has been called */
+ bool m_commit= false;
+ /** whether freeing_tree() has been called */
+ bool m_freeing_tree= false;
+#endif
+private:
+ /** The page of the most recent m_log record written, or NULL */
+ const buf_page_t* m_last;
+ /** The current byte offset in m_last, or 0 */
+ uint16_t m_last_offset;
+
+ /** specifies which operations should be logged; default MTR_LOG_ALL */
+ uint16_t m_log_mode:2;
+
+ /** whether at least one persistent page was written to */
+ uint16_t m_modifications:1;
+
+ /** whether at least one previously clean buffer pool page was written to */
+ uint16_t m_made_dirty:1;
+
+ /** whether log_sys.latch is locked exclusively */
+ uint16_t m_latch_ex:1;
+
+ /** whether change buffer is latched; only needed in non-debug builds
+ to suppress some read-ahead operations, @see ibuf_inside() */
+ uint16_t m_inside_ibuf:1;
+
+ /** whether the pages has been trimmed */
+ uint16_t m_trim_pages:1;
+
+ /** CRC-32C of m_log */
+ uint32_t m_crc;
+
+#ifdef UNIV_DEBUG
+ /** Persistent user tablespace associated with the
+ mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
+ uint32_t m_user_space_id;
+#endif /* UNIV_DEBUG */
+
+ /** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
+ small_vector<mtr_memo_slot_t, 16> m_memo;
+
+ /** mini-transaction log */
+ mtr_buf_t m_log;
+
+ /** user tablespace that is being modified by the mini-transaction */
+ fil_space_t* m_user_space;
+
+ /** LSN at commit time */
+ lsn_t m_commit_lsn;
+
+ /** tablespace where pages have been freed */
+ fil_space_t *m_freed_space= nullptr;
+ /** set of freed page ids */
+ range_set *m_freed_pages= nullptr;
+};
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
new file mode 100644
index 00000000..19db13a1
--- /dev/null
+++ b/storage/innobase/include/mtr0types.h
@@ -0,0 +1,347 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0types.h
+Mini-transaction buffer global types
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "buf0types.h"
+
+#include "ut0byte.h"
+
+struct mtr_t;
+
+/** Logging modes for a mini-transaction */
+enum mtr_log_t {
+ /** Default mode: log all operations modifying disk-based data */
+ MTR_LOG_ALL = 0,
+
+ /** Log no operations and dirty pages are not added to the flush list.
+ Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */
+ MTR_LOG_NONE,
+
+ /** Log all operations, but do not write any OPT_PAGE_CHECKSUM
+ records because some of the modified pages were also modified
+ by another mini-transaction that did not write its log yet. */
+ MTR_LOG_SUB,
+
+ /** Don't generate REDO log but add dirty pages to flush list */
+ MTR_LOG_NO_REDO
+};
+
+/*
+A mini-transaction is a stream of records that is always terminated by
+a byte 0x00 or 0x01. The first byte of a mini-transaction record is
+never one of these bytes, but these bytes can occur within mini-transaction
+records.
+
+The first byte of the record would contain a record type, flags, and a
+part of length. The optional second byte of the record will contain
+more length. (Not needed for short records.)
+
+For example, because the length of an INIT_PAGE record is 3 to 11 bytes,
+the first byte will be 0x02 to 0x0a, indicating the number of subsequent bytes.
+
+Bit 7 of the first byte of a redo log record is the same_page flag.
+If same_page=1, the record is referring to the same page as the
+previous record. Records that do not refer to data pages but to file
+operations are identified by setting the same_page=1 in the very first
+record(s) of the mini-transaction. A mini-transaction record that
+carries same_page=0 must only be followed by page-oriented records.
+
+Bits 6..4 of the first byte of a redo log record identify the redo log
+type. The following record types refer to data pages:
+
+ FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE
+ INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2
+ EXTENDED (2): extended record; followed by subtype code @see mrec_ext_t
+ WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_*
+ MEMSET (4): extends the 10.4 MLOG_MEMSET record
+ MEMMOVE (5): copy data within the page (avoids logging redundant data)
+ RESERVED (6): reserved for future use; a subtype code
+ (encoded immediately after the length) would be written
+ to reserve code space for further extensions
+ OPTION (7): optional record that may be ignored; a subtype @see mrec_opt
+ (encoded immediately after the length) would distinguish actual usage
+
+Bits 3..0 indicate the redo log record length, excluding the first
+byte, but including additional length bytes and any other bytes,
+such as the optional tablespace identifier and page number.
+Values 1..15 represent lengths of 1 to 15 bytes. The special value 0
+indicates that 1 to 3 length bytes will follow to encode the remaining
+length that exceeds 16 bytes.
+
+Additional length bytes if length>16: 0 to 3 bytes
+0xxxxxxx for 0 to 127 (total: 16 to 143 bytes)
+10xxxxxx xxxxxxxx for 128 to 16511 (total: 144 to 16527)
+110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679)
+111xxxxx reserved (corrupted record, and file!)
+
+If same_page=0, the tablespace identifier and page number will use
+similar 1-to-5-byte variable-length encoding:
+0xxxxxxx for 0 to 127
+10xxxxxx xxxxxxxx for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663
+1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx for 2,113,664 to 270,549,119
+11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487
+11111xxx reserved (corrupted record)
+Note: Some 5-byte values are reserved, because the tablespace identifier
+and page number can only be up to 4,294,967,295.
+
+If same_page=1 is set in a record that follows a same_page=0 record
+in a mini-transaction, the tablespace identifier and page number
+fields will be omitted.
+
+For FILE_ records (if same_page=1 for the first record
+of a mini-transaction), we will write a tablespace identifier and
+a page number (always 0) using the same 1-to-5-byte encoding.
+
+For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated
+as corrupted (or reserved for future extension). The type code must
+be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier
+and page number). If the record length does not match the encoded
+lengths of the tablespace identifier and page number, the record will
+be treated as corrupted. This allows future expansion of the format.
+
+If there is a FREE_PAGE record in a mini-transaction, it must be the
+only record for that page in the mini-transaction. If there is an
+INIT_PAGE record for a page in a mini-transaction, it must be the
+first record for that page in the mini-transaction.
+
+An EXTENDED record must be followed by 1+1 to 5+5 bytes for the page
+identifier (unless the same_page flag is set) and a subtype; @see mrec_ext_t
+
+For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset
+on the page, relative from the previous offset. If same_page=0, the
+"previous offset" is 0. If same_page=1, the "previous offset" is where
+the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE).
+0xxxxxxx for 0 to 127
+10xxxxxx xxxxxxxx for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663
+111xxxxx reserved (corrupted record)
+If the sum of the "previous offset" and the current offset exceeds the
+page size, the record is treated as corrupted. Negative relative offsets
+cannot be written. Instead, a record with same_page=0 can be written.
+
+For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to
+3 bytes. If the length+offset exceeds the page size, the record will
+be treated as corrupted.
+
+For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes,
+relative to the current offset. The offset 0 is not possible, and
+the sign bit is the least significant bit. That is,
++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and
+-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+The source offset must be within the page size, or else the record
+will be treated as corrupted.
+
+For MEMSET or WRITE, the byte(s) to be written will follow. For
+MEMSET, it usually is a single byte, but it could also be a multi-byte
+string, which would be copied over and over until the target length is
+reached. The length of the remaining bytes is implied by the length
+bytes at the start of the record.
+
+For MEMMOVE, if any bytes follow, the record is treated as corrupted
+(future expansion).
+
+As mentioned at the start of this comment, the type byte 0 would be
+special, marking the end of a mini-transaction. We could use the
+corresponding value 0x80 (with same_page=1) for something special,
+such as a future extension when more type codes are needed, or for
+encoding rarely needed redo log records.
+
+Examples:
+
+INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2
+bytes to follow" and "tablespace ID 0x34", "page number 0x56".
+The first byte must be between 0x12 and 0x1a, and the total length of
+the record must match the lengths of the encoded tablespace ID and
+page number.
+
+WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning
+"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40",
+"page number 0x57", "byte offset 0x60", data 0x34,0x56.
+
+A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23
+0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to
+follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78.
+
+The end of the mini-transaction would be indicated by the end byte
+0x00 or 0x01; @see log_sys.get_sequence_bit().
+If log_sys.is_encrypted(), that is followed by 8 bytes of nonce
+(part of initialization vector). That will be followed by 4 bytes
+of CRC-32C of the entire mini-tranasction, excluding the end byte. */
+
+/** Redo log record types. These bit patterns (3 bits) will be written
+to the redo log file, so the existing codes or their interpretation on
+crash recovery must not be changed. */
+enum mrec_type_t
+{
+ /** Free a page. On recovery, it is unnecessary to read the page.
+ The next record for the page (if any) must be INIT_PAGE.
+ After this record has been written, the page may be
+ overwritten with zeros, or discarded or trimmed. */
+ FREE_PAGE= 0,
+ /** Zero-initialize a page. The current byte offset (for subsequent
+ records) will be reset to FIL_PAGE_TYPE. */
+ INIT_PAGE= 0x10,
+ /** Extended record; @see mrec_ext_t */
+ EXTENDED= 0x20,
+ /** Write a string of bytes. Followed by the byte offset (unsigned,
+ relative to the current byte offset, encoded in 1 to 3 bytes) and
+ the bytes to write (at least one). The current byte offset will be
+ set after the last byte written. */
+ WRITE= 0x30,
+ /** Like WRITE, but before the bytes to write, the data_length-1
+ (encoded in 1 to 3 bytes) will be encoded, and it must be more
+ than the length of the following data bytes to write.
+ The data byte(s) will be repeatedly copied to the output until
+ the data_length is reached. */
+ MEMSET= 0x40,
+ /** Like MEMSET, but instead of the bytes to write, a source byte
+ offset (signed, nonzero, relative to the target byte offset, encoded
+ in 1 to 3 bytes, with the sign bit in the least significant bit)
+ will be written.
+ That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...)
+ and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+ The source offset and data_length must be within the page size, or
+ else the record will be treated as corrupted. The data will be
+ copied from the page as it was at the start of the
+ mini-transaction. */
+ MEMMOVE= 0x50,
+ /** Reserved for future use. */
+ RESERVED= 0x60,
+ /** Optional record that may be ignored in crash recovery.
+ A subtype (@see mrec_opt) will be encoded after the page identifier. */
+ OPTION= 0x70
+};
+
+
+/** Supported EXTENDED record subtypes. */
+enum mrec_ext_t
+{
+ /** Partly initialize a ROW_FORMAT=REDUNDANT B-tree or R-tree index page,
+ including writing the "infimum" and "supremum" pseudo-records.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INIT_ROW_FORMAT_REDUNDANT= 0,
+ /** Partly initialize a ROW_FORMAT=COMPACT or DYNAMIC index page,
+ including writing the "infimum" and "supremum" pseudo-records.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INIT_ROW_FORMAT_DYNAMIC= 1,
+ /** Initialize an undo log page.
+ This is roughly (not exactly) equivalent to the old MLOG_UNDO_INIT record.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ UNDO_INIT= 2,
+ /** Append a record to an undo log page.
+ This is equivalent to the old MLOG_UNDO_INSERT record.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ UNDO_APPEND= 3,
+ /** Insert a ROW_FORMAT=REDUNDANT record, extending PAGE_HEAP_TOP.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_HEAP_REDUNDANT= 4,
+ /** Insert a ROW_FORMAT=REDUNDANT record, reusing PAGE_FREE.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_REUSE_REDUNDANT= 5,
+ /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, extending PAGE_HEAP_TOP.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_HEAP_DYNAMIC= 6,
+ /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, reusing PAGE_FREE.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_REUSE_DYNAMIC= 7,
+ /** Delete a record on a ROW_FORMAT=REDUNDANT page.
+ We point to the precedessor of the record to be deleted.
+ The current byte offset will be reset to FIL_PAGE_TYPE.
+ This is similar to the old MLOG_REC_DELETE record. */
+ DELETE_ROW_FORMAT_REDUNDANT= 8,
+ /** Delete a record on a ROW_FORMAT=COMPACT or DYNAMIC page.
+ We point to the precedessor of the record to be deleted
+ and include the total size of the record being deleted.
+ The current byte offset will be reset to FIL_PAGE_TYPE.
+ This is similar to the old MLOG_COMP_REC_DELETE record. */
+ DELETE_ROW_FORMAT_DYNAMIC= 9,
+ /** Truncate a data file. */
+ TRIM_PAGES= 10
+};
+
+
+/** Recognized OPTION record subtypes. */
+enum mrec_opt
+{
+ /** page checksum at the end of the mini-transaction */
+ OPT_PAGE_CHECKSUM= 0
+ /* Other possible subtypes: a binlog record, or an SQL statement. */
+};
+
+
+/** Redo log record types for file-level operations. These bit
+patterns will be written to redo log files, so the existing codes or
+their interpretation on crash recovery must not be changed. */
+enum mfile_type_t
+{
+ /** Create a file. Followed by tablespace ID and the file name. */
+ FILE_CREATE = 0x80,
+ /** Delete a file. Followed by tablespace ID and the file name. */
+ FILE_DELETE = 0x90,
+ /** Rename a file. Followed by tablespace ID and the old file name,
+ NUL, and the new file name. */
+ FILE_RENAME = 0xa0,
+ /** Modify a file. Followed by tablespace ID and the file name. */
+ FILE_MODIFY = 0xb0,
+ /** End-of-checkpoint marker, at the end of a mini-transaction.
+ Followed by 2 NUL bytes of page identifier and 8 bytes of LSN;
+ @see SIZE_OF_FILE_CHECKPOINT.
+ When all bytes are NUL, this is a dummy padding record. */
+ FILE_CHECKPOINT = 0xf0
+};
+
+/** Size of a FILE_CHECKPOINT record, including the trailing byte to
+terminate the mini-transaction and the CRC-32C. */
+constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1 + 4;
+
+#ifndef UNIV_INNOCHECKSUM
+/** Types for the mlock objects to store in the mtr_t::m_memo */
+enum mtr_memo_type_t {
+ MTR_MEMO_PAGE_S_FIX = RW_S_LATCH,
+
+ MTR_MEMO_PAGE_X_FIX = RW_X_LATCH,
+
+ MTR_MEMO_PAGE_SX_FIX = RW_SX_LATCH,
+
+ MTR_MEMO_BUF_FIX = RW_NO_LATCH,
+
+ MTR_MEMO_MODIFY = 16,
+
+ MTR_MEMO_PAGE_X_MODIFY = MTR_MEMO_PAGE_X_FIX | MTR_MEMO_MODIFY,
+ MTR_MEMO_PAGE_SX_MODIFY = MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_MODIFY,
+
+ MTR_MEMO_S_LOCK = RW_S_LATCH << 5,
+
+ MTR_MEMO_X_LOCK = RW_X_LATCH << 5,
+
+ MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5,
+
+ /** wr_lock() on fil_space_t::latch */
+ MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1
+};
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
new file mode 100644
index 00000000..c9db6a1f
--- /dev/null
+++ b/storage/innobase/include/os0file.h
@@ -0,0 +1,1188 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "fsp0types.h"
+#include "tpool.h"
+#include "my_counter.h"
+
+#ifndef _WIN32
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif /* !_WIN32 */
+
+extern bool os_has_said_disk_full;
+
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
+
+class buf_tmp_buffer_t;
+
+#ifdef _WIN32
+
+/** We define always WIN_ASYNC_IO, and check at run-time whether
+the OS actually supports it: Win 95 does not, NT does. */
+# define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+# define UNIV_NON_BUFFERED_IO
+
+/** File handle */
+typedef native_file_handle os_file_t;
+
+
+#else /* _WIN32 */
+
+/** File handle */
+typedef int os_file_t;
+
+#endif /* _WIN32 */
+
+static const os_file_t OS_FILE_CLOSED = IF_WIN(os_file_t(INVALID_HANDLE_VALUE),-1);
+
+/** File descriptor with optional PERFORMANCE_SCHEMA instrumentation */
+struct pfs_os_file_t
+{
+ /** Default constructor */
+ pfs_os_file_t(os_file_t file = OS_FILE_CLOSED) : m_file(file)
+#ifdef UNIV_PFS_IO
+ , m_psi(NULL)
+#endif
+ {}
+
+ /** The wrapped file handle */
+ os_file_t m_file;
+#ifdef UNIV_PFS_IO
+ /** PERFORMANCE_SCHEMA descriptor */
+ struct PSI_file *m_psi;
+#endif
+ /** Implicit type conversion.
+ @return the wrapped file handle */
+ operator os_file_t() const { return m_file; }
+ /** Assignment operator.
+ @param[in] file file handle to be assigned */
+ void operator=(os_file_t file) { m_file = file; }
+ bool operator==(os_file_t file) const { return m_file == file; }
+ bool operator!=(os_file_t file) const { return !(*this == file); }
+#ifndef DBUG_OFF
+ friend std::ostream& operator<<(std::ostream& os, pfs_os_file_t f){
+ os << os_file_t(f);
+ return os;
+ }
+#endif
+};
+
+/** Options for os_file_create_func @{ */
+enum os_file_create_t {
+ OS_FILE_OPEN = 51, /*!< to open an existing file (if
+ doesn't exist, error) */
+ OS_FILE_CREATE, /*!< to create new file (if
+ exists, error) */
+ OS_FILE_OVERWRITE, /*!< to create a new file, if exists
+ the overwrite old file */
+ OS_FILE_OPEN_RAW, /*!< to open a raw device or disk
+ partition */
+ OS_FILE_CREATE_PATH, /*!< to create the directories */
+ OS_FILE_OPEN_RETRY, /*!< open with retry */
+
+ /** Flags that can be combined with the above values. Please ensure
+ that the above values stay below 128. */
+
+ OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */
+ OS_FILE_ON_ERROR_SILENT = 256 /*!< don't print diagnostic messages to
+ the log unless it is a fatal error,
+ this flag is only used if
+ ON_ERROR_NO_EXIT is set */
+};
+
+static const ulint OS_FILE_READ_ONLY = 333;
+static const ulint OS_FILE_READ_WRITE = 444;
+
+/** Used by MySQLBackup */
+static const ulint OS_FILE_READ_ALLOW_DELETE = 555;
+
+/* Options for file_create */
+static const ulint OS_FILE_AIO = 61;
+static const ulint OS_FILE_NORMAL = 62;
+/* @} */
+
+/** Types for file create @{ */
+static const ulint OS_DATA_FILE = 100;
+static const ulint OS_LOG_FILE = 101;
+static const ulint OS_DATA_FILE_NO_O_DIRECT = 103;
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+static const ulint OS_FILE_NAME_TOO_LONG = 36;
+static const ulint OS_FILE_NOT_FOUND = 71;
+static const ulint OS_FILE_DISK_FULL = 72;
+static const ulint OS_FILE_ALREADY_EXISTS = 73;
+static const ulint OS_FILE_PATH_ERROR = 74;
+
+/** wait for OS aio resources to become available again */
+static const ulint OS_FILE_AIO_RESOURCES_RESERVED = 75;
+
+static const ulint OS_FILE_SHARING_VIOLATION = 76;
+static const ulint OS_FILE_ERROR_NOT_SPECIFIED = 77;
+static const ulint OS_FILE_INSUFFICIENT_RESOURCE = 78;
+static const ulint OS_FILE_AIO_INTERRUPTED = 79;
+static const ulint OS_FILE_OPERATION_ABORTED = 80;
+static const ulint OS_FILE_ACCESS_VIOLATION = 81;
+static const ulint OS_FILE_OPERATION_NOT_SUPPORTED = 125;
+static const ulint OS_FILE_ERROR_MAX = 200;
+/* @} */
+
+/**
+The I/O context that is passed down to the low level IO code */
+class IORequest
+{
+public:
+ enum Type
+ {
+ /** Synchronous read */
+ READ_SYNC= 2,
+ /** Asynchronous read; some errors will be ignored */
+ READ_ASYNC= READ_SYNC | 1,
+ /** Possibly partial read; only used with
+ os_file_read_no_error_handling() */
+ READ_MAYBE_PARTIAL= READ_SYNC | 4,
+ /** Read for doublewrite buffer recovery */
+ DBLWR_RECOVER= READ_SYNC | 8,
+ /** Synchronous write */
+ WRITE_SYNC= 16,
+ /** Asynchronous write */
+ WRITE_ASYNC= WRITE_SYNC | 1,
+ /** A doublewrite batch */
+ DBLWR_BATCH= WRITE_ASYNC | 8,
+ /** Write data; evict the block on write completion */
+ WRITE_LRU= WRITE_ASYNC | 32,
+ /** Write data and punch hole for the rest */
+ PUNCH= WRITE_ASYNC | 64,
+ /** Write data and punch hole; evict the block on write completion */
+ PUNCH_LRU= PUNCH | WRITE_LRU,
+ /** Zero out a range of bytes in fil_space_t::io() */
+ PUNCH_RANGE= WRITE_SYNC | 128,
+ };
+
+ constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot,
+ fil_node_t *node, Type type) :
+ bpage(bpage), slot(slot), node(node), type(type) {}
+
+ constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr,
+ buf_tmp_buffer_t *slot= nullptr) :
+ bpage(bpage), slot(slot), type(type) {}
+
+ bool is_read() const { return (type & READ_SYNC) != 0; }
+ bool is_write() const { return (type & WRITE_SYNC) != 0; }
+ bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
+ bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
+
+ void write_complete(int io_error) const;
+ void read_complete(int io_error) const;
+ void fake_read_complete(os_offset_t offset) const;
+
+ /** If requested, free storage space associated with a section of the file.
+ @param off byte offset from the start (SEEK_SET)
+ @param len size of the hole in bytes
+ @return DB_SUCCESS or error code */
+ dberr_t maybe_punch_hole(os_offset_t off, ulint len)
+ {
+ return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
+ ? punch_hole(off, len)
+ : DB_SUCCESS;
+ }
+
+private:
+ /** Free storage space associated with a section of the file.
+ @param off byte offset from the start (SEEK_SET)
+ @param len size of the hole in bytes
+ @return DB_SUCCESS or error code */
+ dberr_t punch_hole(os_offset_t off, ulint len) const;
+
+public:
+ /** Page to be written on write operation */
+ buf_page_t *const bpage= nullptr;
+
+ /** Memory to be used for encrypted or page_compressed pages */
+ buf_tmp_buffer_t *const slot= nullptr;
+
+ /** File descriptor */
+ fil_node_t *const node= nullptr;
+
+ /** Request type bit flags */
+ const Type type;
+};
+
+constexpr IORequest IORequestRead(IORequest::READ_SYNC);
+constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
+constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
+
+/** Sparse file size information. */
+struct os_file_size_t {
+ /** Total size of file in bytes */
+ os_offset_t m_total_size;
+
+ /** If it is a sparse file then this is the number of bytes
+ actually allocated for the file. */
+ os_offset_t m_alloc_size;
+};
+
+constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
+
+extern Atomic_counter<ulint> os_n_file_reads;
+extern Atomic_counter<size_t> os_n_file_writes;
+extern Atomic_counter<size_t> os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_t {
+ OS_FILE_TYPE_UNKNOWN = 0,
+ OS_FILE_TYPE_FILE, /* regular file */
+ OS_FILE_TYPE_DIR, /* directory */
+ OS_FILE_TYPE_LINK, /* symbolic link */
+ OS_FILE_TYPE_BLOCK /* block device */
+};
+
+/* Maximum path string length in bytes when referring to tables with in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes. The maximum path length used by any storage engine
+in the server must be at least this big. */
+
+/* MySQL 5.7 my_global.h */
+#ifndef FN_REFLEN_SE
+#define FN_REFLEN_SE 4000
+#endif
+
+#define OS_FILE_MAX_PATH 4000
+#if (FN_REFLEN_SE < OS_FILE_MAX_PATH)
+# error "(FN_REFLEN_SE < OS_FILE_MAX_PATH)"
+#endif
+
+/** Struct used in fetching information of a file in a directory */
+struct os_file_stat_t {
+ char name[OS_FILE_MAX_PATH]; /*!< path to a file */
+ os_file_type_t type; /*!< file type */
+ os_offset_t size; /*!< file size in bytes */
+ os_offset_t alloc_size; /*!< Allocated size for
+ sparse files in bytes */
+ size_t block_size; /*!< Block size to use for IO
+ in bytes*/
+ time_t ctime; /*!< creation time */
+ time_t mtime; /*!< modification time */
+ time_t atime; /*!< access time */
+ bool rw_perm; /*!< true if can be opened
+ in read-write mode. Only valid
+ if type == OS_FILE_TYPE_FILE */
+};
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the in the mysql server configuration
+parameter (--tmpdir).
+@return temporary file handle, or NULL on error */
+FILE*
+os_file_create_tmpfile();
+
+/**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists arguments is true.
+
+@param[in] pathname directory name as null-terminated string
+@param[in] fail_if_exists if true, pre-existing directory is treated
+ as an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+ const char* pathname,
+ bool fail_if_exists);
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeed, false if error
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success);
+
+/** NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option
+ is used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success)
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef _WIN32
+#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0)
+#else
+/** Tries to disable OS caching on an opened file descriptor.
+@param[in] fd file descriptor to alter
+@param[in] file_name file name, used in the diagnostic message
+@param[in] name "open" or "create"; used in the diagnostic
+ message */
+void
+os_file_set_nocache(
+/*================*/
+ int fd, /*!< in: file descriptor to alter */
+ const char* file_name,
+ const char* operation_name);
+#endif
+
+#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
+/** Obtain an exclusive lock on a file.
+@param fd file descriptor
+@param name file name
+@return 0 on success */
+int os_file_lock(int fd, const char *name);
+#endif
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async I/O or unbuffered I/O: look in the
+ function source code for the exact rules
+@param[in] type OS_DATA_FILE or OS_LOG_FILE
+@param[in] read_only if true read only mode checks are enforced
+@param[in] success true if succeeded
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Deletes a file. The file has to be closed before calling this.
+@param[in] name file path as a null-terminated string
+@return true if success */
+bool
+os_file_delete_func(const char* name);
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+@param[in] name file path as a null-terminated string
+@param[out] exist indicate if file pre-exist
+@return true if success */
+bool
+os_file_delete_if_exists_func(const char* name, bool* exist);
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly
+this function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@return true if success */
+bool
+os_file_rename_func(const char* oldpath, const char* newpath);
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in] file own: handle to a file
+@return true if success */
+bool os_file_close_func(os_file_t file);
+
+#ifdef UNIV_PFS_IO
+
+/* Keys to register InnoDB I/O with performance schema */
+extern mysql_pfs_key_t innodb_data_file_key;
+extern mysql_pfs_key_t innodb_temp_file_key;
+
+/* Following four macros are instumentations to register
+various file I/O operations with performance schema.
+1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
+used to register file creation, opening, closing and renaming.
+2) register_pfs_file_rename_begin() and register_pfs_file_rename_end()
+are used to register file renaming
+2) register_pfs_file_io_begin() and register_pfs_file_io_end() are
+used to register actual file read, write and flush
+3) register_pfs_file_close_begin() and register_pfs_file_close_end()
+are used to register file deletion operations*/
+# define register_pfs_file_open_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_name_locker)( \
+ state, key, op, name, &locker); \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(start_file_open_wait)( \
+ locker, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_open_end(locker, file, result) \
+do { \
+ if (locker != NULL) { \
+ file.m_psi = PSI_FILE_CALL(end_file_open_wait)( \
+ locker, result); \
+ } \
+} while (0)
+
+# define register_pfs_file_rename_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+ register_pfs_file_open_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+
+# define register_pfs_file_rename_end(locker, from, to, result) \
+do { \
+ if (locker != NULL) { \
+ PSI_FILE_CALL( \
+ end_file_rename_wait)( \
+ locker, from, to, result); \
+ } \
+} while (0)
+
+# define register_pfs_file_close_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_name_locker)( \
+ state, key, op, name, &locker); \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(start_file_close_wait)( \
+ locker, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_close_end(locker, result) \
+do { \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(end_file_close_wait)( \
+ locker, result); \
+ } \
+} while (0)
+
+# define register_pfs_file_io_begin(state, locker, file, count, op, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_stream_locker)( \
+ state, file.m_psi, op); \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(start_file_wait)( \
+ locker, count, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_io_end(locker, count) \
+do { \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(end_file_wait)(locker, count); \
+ } \
+} while (0)
+
+/* Following macros/functions are file I/O APIs that would be performance
+schema instrumented if "UNIV_PFS_IO" is defined. They would point to
+wrapper functions with performance schema instrumentation in such case.
+
+os_file_create
+os_file_create_simple
+os_file_create_simple_no_error_handling
+os_file_close
+os_file_rename
+os_aio
+os_file_read
+os_file_read_no_error_handling
+os_file_write
+
+The wrapper functions have the prefix of "innodb_". */
+
+# define os_file_create(key, name, create, purpose, type, read_only, \
+ success) \
+ pfs_os_file_create_func(key, name, create, purpose, type, \
+ read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple(key, name, create, access, \
+ read_only, success) \
+ pfs_os_file_create_simple_func(key, name, create, access, \
+ read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, read_only, success) \
+ pfs_os_file_create_simple_no_error_handling_func( \
+ key, name, create_mode, access, \
+ read_only, success, __FILE__, __LINE__)
+
+# define os_file_close(file) \
+ pfs_os_file_close_func(file, __FILE__, __LINE__)
+
+# define os_file_read(type, file, buf, offset, n, o) \
+ pfs_os_file_read_func(type, file, buf, offset, n,o, __FILE__, __LINE__)
+
+# define os_file_write(type, name, file, buf, offset, n) \
+ pfs_os_file_write_func(type, name, file, buf, offset, \
+ n, __FILE__, __LINE__)
+
+# define os_file_flush(file) \
+ pfs_os_file_flush_func(file, __FILE__, __LINE__)
+
+# define os_file_rename(key, oldpath, newpath) \
+ pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+
+# define os_file_delete(key, name) \
+ pfs_os_file_delete_func(key, name, __FILE__, __LINE__)
+
+# define os_file_delete_if_exists(key, name, exist) \
+ pfs_os_file_delete_if_exists_func(key, name, exist, __FILE__, __LINE__)
+
+/** NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async I/O or unbuffered I/O: look in the
+ function source code for the exact rules
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@param[in] file handle to a file
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line);
+
+/** NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+ const IORequest& type,
+ pfs_os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o,
+ const char* src_file,
+ uint src_line);
+
+/** NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@param[in] type IO request context
+@param[in] name Name of the file or path as NUL terminated
+ string
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+ const IORequest& type,
+ const char* name,
+ pfs_os_file_t file,
+ const void* buf,
+ os_offset_t offset,
+ ulint n,
+ const char* src_file,
+ uint src_line);
+
+/** NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+Flushes the write buffers of a given file to the disk.
+@param[in] file Open file handle
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line);
+
+
+/** NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@param[in] key Performance Schema Key
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+ mysql_pfs_key_t key,
+ const char* oldpath,
+ const char* newpath,
+ const char* src_file,
+ uint src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@param[in] key Performance Schema Key
+@param[in] name old file path as a null-terminated string
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ const char* src_file,
+ uint src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@param[in] key Performance Schema Key
+@param[in] name old file path as a null-terminated string
+@param[in] exist indicate if file pre-exist
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ bool* exist,
+ const char* src_file,
+ uint src_line);
+
+#else /* UNIV_PFS_IO */
+
+/* If UNIV_PFS_IO is not defined, these I/O APIs point
+to original un-instrumented file I/O APIs */
+# define os_file_create(key, name, create, purpose, type, read_only, \
+ success) \
+ os_file_create_func(name, create, purpose, type, read_only, \
+ success)
+
+# define os_file_create_simple(key, name, create_mode, access, \
+ read_only, success) \
+ os_file_create_simple_func(name, create_mode, access, \
+ read_only, success)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, read_only, success) \
+ os_file_create_simple_no_error_handling_func( \
+ name, create_mode, access, read_only, success)
+
+# define os_file_close(file) os_file_close_func(file)
+
+# define os_file_read(type, file, buf, offset, n, o) \
+ os_file_read_func(type, file, buf, offset, n, o)
+
+# define os_file_write(type, name, file, buf, offset, n) \
+ os_file_write_func(type, name, file, buf, offset, n)
+
+# define os_file_flush(file) os_file_flush_func(file)
+
+# define os_file_rename(key, oldpath, newpath) \
+ os_file_rename_func(oldpath, newpath)
+
+# define os_file_delete(key, name) os_file_delete_func(name)
+
+# define os_file_delete_if_exists(key, name, exist) \
+ os_file_delete_if_exists_func(name, exist)
+
+#endif /* UNIV_PFS_IO */
+
+/** Gets a file size.
+@param[in] file handle to a file
+@return file size if OK, else set m_total_size to ~0 and m_alloc_size
+ to errno */
+os_file_size_t
+os_file_get_size(
+ const char* filename)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets a file size.
+@param[in] file handle to a file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(
+ os_file_t file)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Extend a file.
+
+On Windows, extending a file allocates blocks for the file,
+unless the file is sparse.
+
+On Unix, we will extend the file with ftruncate(), if
+file needs to be sparse. Otherwise posix_fallocate() is used
+when available, and if not, binary zeroes are added to the end
+of file.
+
+@param[in] name file name
+@param[in] file file handle
+@param[in] size desired file size
+@param[in] sparse whether to create a sparse file (no preallocating)
+@return whether the operation succeeded */
+bool
+os_file_set_size(
+ const char* name,
+ os_file_t file,
+ os_offset_t size,
+ bool is_sparse = false)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Truncates a file at its current position.
+@param[in/out] file file to be truncated
+@return true if success */
+bool
+os_file_set_eof(
+ FILE* file); /*!< in: file to be truncated */
+
+/** Truncate a file to a specified size in bytes.
+@param[in] pathname file path
+@param[in] file file to be truncated
+@param[in] size size preserved in bytes
+@param[in] allow_shrink whether to allow the file to become smaller
+@return true if success */
+bool
+os_file_truncate(
+ const char* pathname,
+ os_file_t file,
+ os_offset_t size,
+ bool allow_shrink = false);
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in] file handle to a file
+@return true if success */
+bool
+os_file_flush_func(
+ os_file_t file);
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in] report_all_errors true if we want an error message
+ printed of all errors
+@param[in] on_error_silent true then don't print any diagnostic
+ to the log
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
+ulint os_file_get_last_error(bool report_all_errors,
+ bool on_error_silent= false);
+
+/** NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_read_func(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files.
+@param[in,out] file file to read from
+@param[in,out] str buffer where to read
+@param[in] size size of buffer */
+void
+os_file_read_string(
+ FILE* file,
+ char* str,
+ ulint size);
+
+/** NOTE! Use the corresponding macro os_file_write(), not directly this
+function!
+Requests a synchronous write operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_write_func(
+ const IORequest& type,
+ const char* name,
+ os_file_t file,
+ const void* buf,
+ os_offset_t offset,
+ ulint n)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Check the existence and type of the given file.
+@param[in] path pathname of the file
+@param[out] exists true if file exists
+@param[out] type type of the file (if it exists)
+@return true if call succeeded */
+bool
+os_file_status(
+ const char* path,
+ bool* exists,
+ os_file_type_t* type);
+
+/** This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command.
+@param[in,out] data_dir_path Full path/data_dir_path */
+void
+os_file_make_data_dir_path(
+ char* data_dir_path);
+
+/** Create all missing subdirectories along the given path.
+@return DB_SUCCESS if OK, otherwise error code. */
+dberr_t
+os_file_create_subdirs_if_needed(
+ const char* path);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+/* Test the function os_file_get_parent_dir. */
+void
+unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+/**
+Initializes the asynchronous io system. */
+int os_aio_init();
+
+/**
+Frees the asynchronous io system. */
+void os_aio_free();
+
+/** Submit a fake read request during crash recovery.
+@param type fake read request
+@param offset additional context */
+void os_fake_read(const IORequest &type, os_offset_t offset);
+
+/** Request a read or write.
+@param type I/O request
+@param buf buffer
+@param offset file offset
+@param n number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
+
+/** @return number of pending reads */
+size_t os_aio_pending_reads();
+/** @return approximate number of pending reads */
+size_t os_aio_pending_reads_approx();
+/** @return number of pending writes */
+size_t os_aio_pending_writes();
+
+/** Wait until there are no pending asynchronous writes.
+@param declare whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_writes(bool declare);
+
+/** Wait until all pending asynchronous reads have completed.
+@param declare whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_reads(bool declare);
+
+/** Prints info of the aio arrays.
+@param[in/out] file file where to print */
+void
+os_aio_print(FILE* file);
+
+/** Refreshes the statistics used to print per-second averages. */
+void
+os_aio_refresh_stats();
+
+/** Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+bool
+os_aio_all_slots_free();
+
+
+/** This function returns information about the specified file
+@param[in] path pathname of the file
+@param[in] stat_info information of a file in a directory
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only if true read only mode checks are enforced
+@return DB_SUCCESS if all OK */
+dberr_t
+os_file_get_status(
+ const char* path,
+ os_file_stat_t* stat_info,
+ bool check_rw_perm,
+ bool read_only);
+
+/** Set the file create umask
+@param[in] umask The umask to use for file creation. */
+void
+os_file_set_umask(ulint umask);
+
+#ifdef _WIN32
+
+/**
+Make file sparse, on Windows.
+
+@param[in] file file handle
+@param[in] is_sparse if true, make file sparse,
+ otherwise "unsparse" the file
+@return true on success, false on error */
+bool os_file_set_sparse_win32(os_file_t file, bool is_sparse = true);
+
+/**
+Changes file size on Windows
+
+If file is extended, following happens the bytes between
+old and new EOF are zeros.
+
+If file is sparse, "virtual" block is added at the end of
+allocated area.
+
+If file is normal, file system allocates storage.
+
+@param[in] pathname file path
+@param[in] file file handle
+@param[in] size size to preserve in bytes
+@return true if success */
+bool
+os_file_change_size_win32(
+ const char* pathname,
+ os_file_t file,
+ os_offset_t size);
+
+#endif /*_WIN32 */
+
+/** Free storage space associated with a section of the file.
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_punch_hole(
+ os_file_t fh,
+ os_offset_t off,
+ os_offset_t len)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/* Determine if a path is an absolute path or not.
+@param[in] OS directory or file path to evaluate
+@retval true if an absolute path
+@retval false if a relative path */
+inline bool is_absolute_path(const char *path)
+{
+ switch (path[0]) {
+#ifdef _WIN32
+ case '\0':
+ return false;
+ case '\\':
+#endif
+ case '/':
+ return true;
+ }
+
+#ifdef _WIN32
+ if (path[1] == ':')
+ {
+ switch (path[2]) {
+ case '/':
+ case '\\':
+ return true;
+ }
+ }
+#endif /* _WIN32 */
+
+ return false;
+}
+
+#include "os0file.inl"
+
+#endif /* os0file_h */
diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl
new file mode 100644
index 00000000..7de31505
--- /dev/null
+++ b/storage/innobase/include/os0file.inl
@@ -0,0 +1,412 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0file.ic
+The interface to the operating system file io
+
+Created 2/20/2010 Jimmy Yang
+*******************************************************/
+
+#ifdef UNIV_PFS_IO
+/** NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(
+ &state, locker, key,
+ (create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE : PSI_FILE_OPEN,
+ name, src_file, src_line);
+
+ pfs_os_file_t file = os_file_create_simple_func(
+ name, create_mode, access_type, read_only, success);
+
+ /* Register psi value for the file */
+ register_pfs_file_open_end(locker, file,
+ (*success == TRUE ? success : 0));
+
+ return(file);
+}
+
+/** NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(
+ &state, locker, key,
+ create_mode == OS_FILE_CREATE
+ ? PSI_FILE_CREATE : PSI_FILE_OPEN,
+ name, src_file, src_line);
+
+ pfs_os_file_t file = os_file_create_simple_no_error_handling_func(
+ name, create_mode, access_type, read_only, success);
+
+ register_pfs_file_open_end(locker, file,
+ (*success == TRUE ? success : 0));
+
+ return(file);
+}
+
+/** NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really us
+ async I/O or unbuffered I/O: look in the
+ function source code for the exact rules
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(
+ &state, locker, key,
+ create_mode == OS_FILE_CREATE
+ ? PSI_FILE_CREATE : PSI_FILE_OPEN,
+ name, src_file, src_line);
+
+ pfs_os_file_t file = os_file_create_func(
+ name, create_mode, purpose, type, read_only, success);
+
+ register_pfs_file_open_end(locker, file,
+ (*success == TRUE ? success : 0));
+
+ return(file);
+}
+/**
+NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@param[in] file handle to a file
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register the file close */
+ register_pfs_file_io_begin(
+ &state, locker, file, 0, PSI_FILE_CLOSE, src_file, src_line);
+
+ bool result = os_file_close_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+ const IORequest& type,
+ pfs_os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(
+ &state, locker, file, n, PSI_FILE_READ, src_file, src_line);
+
+ dberr_t result;
+
+ result = os_file_read_func(type, file, buf, offset, n, o);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@param[in] type IO request context
+@param[in] name Name of the file or path as NUL terminated
+ string
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return error code
+@retval DB_SUCCESS if the request was successfully fulfilled */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+ const IORequest& type,
+ const char* name,
+ pfs_os_file_t file,
+ const void* buf,
+ os_offset_t offset,
+ ulint n,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(
+ &state, locker, file, n, PSI_FILE_WRITE, src_file, src_line);
+
+ dberr_t result;
+
+ result = os_file_write_func(type, name, file, buf, offset, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+
+/** NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+Flushes the write buffers of a given file to the disk.
+@param[in] file Open file handle
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(
+ &state, locker, file, 0, PSI_FILE_SYNC, src_file, src_line);
+
+ bool result = os_file_flush_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@param[in] key Performance Schema Key
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+ mysql_pfs_key_t key,
+ const char* oldpath,
+ const char* newpath,
+ const char* src_file,
+ uint src_line)
+
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_rename_begin(
+ &state, locker, key, PSI_FILE_RENAME, newpath,
+ src_file, src_line);
+
+ bool result = os_file_rename_func(oldpath, newpath);
+
+ register_pfs_file_rename_end(locker, oldpath, newpath, !result);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@param[in] key Performance Schema Key
+@param[in] name old file path as a null-terminated string
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_close_begin(
+ &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line);
+
+ bool result = os_file_delete_func(name);
+
+ register_pfs_file_close_end(locker, 0);
+
+ return(result);
+}
+
+/**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@param[in] key Performance Schema Key
+@param[in] name old file path as a null-terminated string
+@param[in] exist indicate if file pre-exist
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ bool* exist,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_close_begin(
+ &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line);
+
+ bool result = os_file_delete_if_exists_func(name, exist);
+
+ register_pfs_file_close_end(locker, 0);
+
+ return(result);
+}
+#endif /* UNIV_PFS_IO */
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
new file mode 100644
index 00000000..28aa3056
--- /dev/null
+++ b/storage/innobase/include/page0cur.h
@@ -0,0 +1,303 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "page0page.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return page */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur); /*!< in: page cursor */
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
+@return record */
+UNIV_INLINE
+rec_t *page_cur_get_rec(const page_cur_t *cur);
+#else /* UNIV_DEBUG */
+# define page_cur_get_page(cur) page_align((cur)->rec)
+# define page_cur_get_block(cur) (cur)->block
+# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur) (cur)->rec
+#endif /* UNIV_DEBUG */
+# define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block)
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur); /*!< out: page cursor */
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+@return pointer to record
+@retval nullptr if not enough space was available */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ const page_cur_t*cur, /*!< in: page cursor */
+ const rec_t* rec, /*!< in: record to insert after cur */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed and uncompressed
+page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to inserted record
+@return nullptr on failure */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ page_cur_t* cursor, /*!< in/out: page cursor,
+ logical position unchanged */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(
+ cursor->rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/** Apply a INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr encoded fixed-size header bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconcistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+ ulint prev, ulint enc_hdr,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len);
+
+/** Apply a INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift unless !reuse: number of bytes the PAGE_FREE is moving
+@param enc_hdr_l number of copied record header bytes, plus record type bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconcistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+ ulint prev, ulint shift, ulint enc_hdr_l,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len);
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconcistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev);
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size data payload size, in bytes
+@return whether the operation failed (inconcistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+ size_t hdr_size, size_t data_size);
+
+MY_ATTRIBUTE((warn_unused_result))
+/****************************************************************//**
+Searches the right position for a page cursor. */
+bool
+page_cur_search_with_match(
+/*=======================*/
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ rtr_info_t* rtr_info);/*!< in/out: rtree search stack */
+#ifdef BTR_CUR_HASH_ADAPT
+MY_ATTRIBUTE((warn_unused_result))
+/** Search the right position for a page cursor.
+@param[in] tuple key to be searched for
+@param[in] mode search mode
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] iup_matched_bytes already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[in,out] ilow_matched_bytes already matched bytes in the
+first partially matched field in the lower limit record
+@param[in,out] cursor page cursor */
+bool
+page_cur_search_with_match_bytes(
+ const dtuple_t* tuple,
+ page_cur_mode_t mode,
+ ulint* iup_matched_fields,
+ ulint* iup_matched_bytes,
+ ulint* ilow_matched_fields,
+ ulint* ilow_matched_bytes,
+ page_cur_t* cursor);
+#endif /* BTR_CUR_HASH_ADAPT */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+void page_cur_open_on_rnd_user_rec(page_cur_t *cursor);
+
+/** Index page cursor */
+
+struct page_cur_t{
+ dict_index_t* index;
+ rec_t* rec; /*!< pointer to a record on page */
+ rec_offs* offsets;
+ buf_block_t* block; /*!< pointer to the block containing rec */
+};
+
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_next(page_cur_t *cur)
+{
+ return cur->rec= page_rec_get_next(cur->rec);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_prev(page_cur_t *cur)
+{
+ return cur->rec= page_rec_get_prev(cur->rec);
+}
+
+#include "page0cur.inl"
+
+#endif
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
new file mode 100644
index 00000000..7c4eafa2
--- /dev/null
+++ b/storage/innobase/include/page0cur.inl
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ return page_align(page_cur_get_rec(cur));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return page */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+ return cur->block;
+}
+
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
+@return record */
+UNIV_INLINE
+rec_t *page_cur_get_rec(const page_cur_t *cur)
+{
+ ut_ad(cur);
+ ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+ return cur->rec;
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = const_cast<buf_block_t*>(block);
+ cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = const_cast<buf_block_t*>(block);
+ cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->page.frame);
+ return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->page.frame);
+ return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(rec && block && cur);
+ ut_ad(page_align(rec) == block->page.frame);
+
+ cur->rec = (rec_t*) rec;
+ cur->block = (buf_block_t*) block;
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint size = rec_get_converted_size(cursor->index, tuple, n_ext);
+
+ if (!*heap) {
+ *heap = mem_heap_create(size
+ + (4 + REC_OFFS_HEADER_SIZE
+ + dtuple_get_n_fields(tuple))
+ * sizeof **offsets);
+ }
+
+ rec_t* rec = rec_convert_dtuple_to_rec(
+ static_cast<byte*>(mem_heap_alloc(*heap, size)),
+ cursor->index, tuple, n_ext);
+
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ page_is_leaf(cursor->block->page.frame)
+ ? cursor->index->n_core_fields : 0,
+ ULINT_UNDEFINED, heap);
+ ut_ad(size == rec_offs_size(*offsets));
+
+ if (is_buf_block_get_page_zip(cursor->block)) {
+ rec = page_cur_insert_rec_zip(cursor, rec, *offsets, mtr);
+ } else {
+ rec = page_cur_insert_rec_low(cursor, rec, *offsets, mtr);
+ }
+
+ ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, cursor->index, *offsets));
+ return(rec);
+}
+
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
new file mode 100644
index 00000000..2978656b
--- /dev/null
+++ b/storage/innobase/include/page0page.h
@@ -0,0 +1,1101 @@
+/*****************************************************************************
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "page0types.h"
+#include "fsp0fsp.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "rem0rec.h"
+#include "mach0data.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+/* PAGE HEADER
+ ===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef byte page_header_t;
+#endif /* !UNIV_INNOCHECKSUM */
+
+#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this
+ offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */
+#define PAGE_HEAP_TOP 2 /* pointer to record heap top */
+#define PAGE_N_HEAP 4 /* number of records in the heap,
+ bit 15=flag: new-style compact page format */
+#define PAGE_FREE 6 /* pointer to start of page free record list */
+#define PAGE_GARBAGE 8 /* number of bytes in deleted records */
+#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or
+ 0 if this info has been reset by a delete,
+ for example */
+
+/** This 10-bit field is usually 0. In B-tree index pages of
+ROW_FORMAT=REDUNDANT tables, this byte can contain garbage if the .ibd
+file was created in MySQL 4.1.0 or if the table resides in the system
+tablespace and was created before MySQL 4.1.1 or MySQL 4.0.14.
+In this case, the FIL_PAGE_TYPE would be FIL_PAGE_INDEX.
+
+In ROW_FORMAT=COMPRESSED tables, this field is always 0, because
+instant ADD COLUMN is not supported.
+
+In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables, this field is
+always 0, except in the root page of the clustered index after instant
+ADD COLUMN.
+
+Instant ADD COLUMN will change FIL_PAGE_TYPE to FIL_PAGE_TYPE_INSTANT
+and initialize the PAGE_INSTANT field to the original number of
+fields in the clustered index (dict_index_t::n_core_fields). The most
+significant bits are in the first byte, and the least significant 5
+bits are stored in the most significant 5 bits of PAGE_DIRECTION_B.
+
+These FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be assigned even if
+instant ADD COLUMN was not committed. Changes to these page header fields
+are not undo-logged, but changes to the hidden metadata record are.
+If the server is killed and restarted, the page header fields could
+remain set even though no metadata record is present.
+
+When the table becomes empty, the PAGE_INSTANT field and the
+FIL_PAGE_TYPE can be reset and any metadata record be removed. */
+#define PAGE_INSTANT 12
+
+/** last insert direction: PAGE_LEFT, ....
+In ROW_FORMAT=REDUNDANT tables created before MySQL 4.1.1 or MySQL 4.0.14,
+this byte can be garbage. */
+#define PAGE_DIRECTION_B 13
+#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same
+ direction */
+#define PAGE_N_RECS 16 /* number of user records on the page */
+/** The largest DB_TRX_ID that may have modified a record on the page;
+Defined only in secondary index leaf pages and in change buffer leaf pages.
+Otherwise written as 0. @see PAGE_ROOT_AUTO_INC */
+#define PAGE_MAX_TRX_ID 18
+/** The AUTO_INCREMENT value (on persistent clustered index root pages). */
+#define PAGE_ROOT_AUTO_INC PAGE_MAX_TRX_ID
+#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page
+ header which are set in a page create */
+/*----*/
+#define PAGE_LEVEL 26 /* level of the node in an index tree; the
+ leaf level is the level 0. This field should
+ not be written to after page creation. */
+#define PAGE_INDEX_ID 28 /* index id where the page belongs.
+ This field should not be written to after
+ page creation. */
+
+#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in
+ a B-tree: defined only on the root page of a
+ B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+ /* in the place of PAGE_BTR_SEG_LEAF and _TOP
+ there is a free list base node if the page is
+ the root page of an ibuf tree, and at the same
+ place is the free list node if the page is in
+ a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+ /* file segment header for the non-leaf pages
+ in a B-tree: defined only on the root page of
+ a B-tree, but not in the root of an ibuf
+ tree */
+/*----*/
+#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+ /* start of data on the page */
+
+#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+ /* offset of the page infimum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+ /* offset of the page supremum record end on
+ an old-style page */
+#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+ /* offset of the page infimum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+ /* offset of the page supremum record end on
+ a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM 0U /* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM 1U /* page supremum */
+#define PAGE_HEAP_NO_USER_LOW 2U /* first user record in
+ creation (insertion) order,
+ not necessarily collation order;
+ this record may have been deleted */
+
+/* Directions of cursor movement (stored in PAGE_DIRECTION field) */
+constexpr uint16_t PAGE_LEFT= 1;
+constexpr uint16_t PAGE_RIGHT= 2;
+constexpr uint16_t PAGE_SAME_REC= 3;
+constexpr uint16_t PAGE_SAME_PAGE= 4;
+constexpr uint16_t PAGE_NO_DIRECTION= 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* PAGE DIRECTORY
+ ==============
+*/
+
+typedef byte page_dir_slot_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address directory start, as it points to
+the first record in the list of records. */
+#define PAGE_DIR FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2;
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED 8
+#define PAGE_DIR_SLOT_MIN_N_OWNED 4
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+#endif /* UNIV_INNOCHECKSUM */
+
+/** Get the start of a page frame.
+@param[in] ptr pointer within a page frame
+@return start of the page frame */
+MY_ATTRIBUTE((const))
+inline page_t* page_align(void *ptr)
+{
+ return my_assume_aligned<UNIV_PAGE_SIZE_MIN>
+ (reinterpret_cast<page_t*>(ut_align_down(ptr, srv_page_size)));
+}
+inline const page_t *page_align(const void *ptr)
+{
+ return page_align(const_cast<void*>(ptr));
+}
+
+/** Gets the byte offset within a page frame.
+@param[in] ptr pointer within a page frame
+@return offset from the start of the page */
+MY_ATTRIBUTE((const))
+inline uint16_t page_offset(const void* ptr)
+{
+ return static_cast<uint16_t>(ut_align_offset(ptr, srv_page_size));
+}
+
+/** Determine whether an index page is not in ROW_FORMAT=REDUNDANT.
+@param[in] page index page
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_is_comp(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return(page[PAGE_HEADER + PAGE_N_HEAP] & 0x80);
+}
+
+/** Determine whether an index page is empty.
+@param[in] page index page
+@return whether the page is empty (PAGE_N_RECS = 0) */
+inline
+bool
+page_is_empty(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_N_RECS
+ + page);
+}
+
+/** Determine whether an index page contains garbage.
+@param[in] page index page
+@return whether the page contains garbage (PAGE_GARBAGE is not 0) */
+inline
+bool
+page_has_garbage(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return *reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_GARBAGE
+ + page);
+}
+
+/** Determine whether an B-tree or R-tree index page is a leaf page.
+@param[in] page index page
+@return true if the page is a leaf (PAGE_LEVEL = 0) */
+inline
+bool
+page_is_leaf(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_LEVEL
+ + page);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Determine whether an index page record is not in ROW_FORMAT=REDUNDANT.
+@param[in] rec record in an index page frame (not a copy)
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_rec_is_comp(const byte* rec)
+{
+ return(page_is_comp(page_align(rec)));
+}
+
+# ifdef UNIV_DEBUG
+/** Determine if the record is the metadata pseudo-record
+in the clustered index.
+@param[in] rec leaf page record on an index page
+@return whether the record is the metadata pseudo-record */
+inline bool page_rec_is_metadata(const rec_t* rec)
+{
+ return rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG;
+}
+# endif /* UNIV_DEBUG */
+
+/** Determine the offset of the infimum record on the page.
+@param[in] page index page
+@return offset of the infimum record in record list, relative from page */
+inline
+unsigned
+page_get_infimum_offset(const page_t* page)
+{
+ ut_ad(!page_offset(page));
+ return page_is_comp(page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM;
+}
+
+/** Determine the offset of the supremum record on the page.
+@param[in] page index page
+@return offset of the supremum record in record list, relative from page */
+inline
+unsigned
+page_get_supremum_offset(const page_t* page)
+{
+ ut_ad(!page_offset(page));
+ return page_is_comp(page) ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM;
+}
+
+/** Determine whether an index page record is a user record.
+@param[in] offset record offset in the page
+@retval true if a user record
+@retval false if the infimum or supremum pseudo-record */
+inline
+bool
+page_rec_is_user_rec_low(ulint offset)
+{
+ compile_time_assert(PAGE_OLD_INFIMUM >= PAGE_NEW_INFIMUM);
+ compile_time_assert(PAGE_OLD_SUPREMUM >= PAGE_NEW_SUPREMUM);
+ compile_time_assert(PAGE_NEW_INFIMUM < PAGE_OLD_SUPREMUM);
+ compile_time_assert(PAGE_OLD_INFIMUM < PAGE_NEW_SUPREMUM);
+ compile_time_assert(PAGE_NEW_SUPREMUM < PAGE_OLD_SUPREMUM_END);
+ compile_time_assert(PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM_END);
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+
+ return(offset != PAGE_NEW_SUPREMUM
+ && offset != PAGE_NEW_INFIMUM
+ && offset != PAGE_OLD_INFIMUM
+ && offset != PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the supremum record on an index page.
+@param[in] offset record offset in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum_low(ulint offset)
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+ return(offset == PAGE_NEW_SUPREMUM || offset == PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the infimum record on an index page.
+@param[in] offset record offset in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum_low(ulint offset)
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+ return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM);
+}
+
+/** Determine whether an B-tree or R-tree index record is in a leaf page.
+@param[in] rec index record in an index page
+@return true if the record is in a leaf page */
+inline
+bool
+page_rec_is_leaf(const page_t* rec)
+{
+ const page_t* page = page_align(rec);
+ ut_ad(ulint(rec - page) >= page_get_infimum_offset(page));
+ bool leaf = page_is_leaf(page);
+ ut_ad(!page_rec_is_comp(rec)
+ || !page_rec_is_user_rec_low(ulint(rec - page))
+ || leaf == !rec_get_node_ptr_flag(rec));
+ return leaf;
+}
+
+/** Determine whether an index page record is a user record.
+@param[in] rec record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec);
+
+/** Determine whether an index page record is the supremum record.
+@param[in] rec record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec);
+
+/** Determine whether an index page record is the infimum record.
+@param[in] rec record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec);
+
+/** Read PAGE_MAX_TRX_ID.
+@param[in] page index page
+@return the value of PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline trx_id_t page_get_max_trx_id(const page_t *page)
+{
+ ut_ad(fil_page_index_page_check(page));
+ static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+ const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_MAX_TRX_ID);
+ return mach_read_from_8(p);
+}
+
+/**
+Set the number of owned records.
+@tparam compressed whether to update any ROW_FORMAT=COMPRESSED page as well
+@param[in,out] block index page
+@param[in,out] rec record in block.frame
+@param[in] n_owned number of records skipped in the sparse page directory
+@param[in] comp whether ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@param[in,out] mtr mini-transaction */
+template<bool compressed>
+inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
+ bool comp, mtr_t *mtr)
+{
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(comp == (page_is_comp(block->page.frame) != 0));
+
+ if (page_zip_des_t *page_zip= compressed
+ ? buf_block_get_page_zip(block) : nullptr)
+ {
+ ut_ad(comp);
+ rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (rec_get_status(rec) != REC_STATUS_SUPREMUM)
+ page_zip_rec_set_owned(block, rec, n_owned, mtr);
+ }
+ else
+ {
+ rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, rec, (*rec & ~REC_N_OWNED_MASK) |
+ (n_owned << REC_N_OWNED_SHIFT));
+ }
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
+@param[in,out] block clustered index root page
+@param[in] autoinc next available AUTO_INCREMENT value
+@param[in,out] mtr mini-transaction
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+page_set_autoinc(
+ buf_block_t* block,
+ ib_uint64_t autoinc,
+ mtr_t* mtr,
+ bool reset)
+ MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ node_seq_t ssn_id, /*!< in: split sequence id */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+
+#endif /* !UNIV_INNOCHECKSUM */
+/** Read a page header field. */
+inline uint16_t page_header_get_field(const page_t *page, ulint field)
+{
+ ut_ad(field <= PAGE_INDEX_ID);
+ ut_ad(!(field & 1));
+ return mach_read_from_2(my_assume_aligned<2>(PAGE_HEADER + field + page));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+Returns the pointer stored in the given header field, or NULL. */
+#define page_header_get_ptr(page, field) \
+ (page_header_get_offs(page, field) \
+ ? page + page_header_get_offs(page, field) : NULL)
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out] block file page
+@param[in,out] mtr mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record
+@retval nullptr on corrupted page */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record
+@retval nullptr on corrupted page */
+inline rec_t *page_rec_get_nth(page_t* page, ulint nth)
+{
+ return const_cast<rec_t*>(page_rec_get_nth_const(page, nth));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+ const page_t* page); /*!< in: page */
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+ const page_t* page); /*!< in: page */
+
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+ const page_t* page); /*!< in: index page */
+
+/** Return the number of preceding records in an index page.
+@param rec index record
+@return number of preceding records, including the infimum pseudo-record
+@retval ULINT_UNDEFINED on corrupted page */
+ulint page_rec_get_n_recs_before(const rec_t *rec);
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page); /*!< in: index page */
+/** Gets the pointer to a directory slot.
+@param n sparse directory slot number
+@return pointer to the sparse directory slot */
+inline page_dir_slot_t *page_dir_get_nth_slot(page_t *page, ulint n)
+{
+ ut_ad(page_dir_get_n_slots(page) > n);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+ return my_assume_aligned<2>(page + srv_page_size - (PAGE_DIR + 2) - n * 2);
+}
+inline const page_dir_slot_t *page_dir_get_nth_slot(const page_t *page,ulint n)
+{
+ return page_dir_get_nth_slot(const_cast<page_t*>(page), n);
+}
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec); /*!< in: record */
+/** Get the record pointed to by a directory slot.
+@param[in] slot directory slot
+@return pointer to record */
+inline rec_t *page_dir_slot_get_rec(page_dir_slot_t *slot)
+{
+ return page_align(slot) + mach_read_from_2(my_assume_aligned<2>(slot));
+}
+inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
+{
+ return page_dir_slot_get_rec(const_cast<rec_t*>(slot));
+}
+
+inline rec_t *page_dir_slot_get_rec_validate(page_dir_slot_t *slot)
+{
+ const size_t s= mach_read_from_2(my_assume_aligned<2>(slot));
+ page_t *page= page_align(slot);
+
+ return UNIV_LIKELY(s >= PAGE_NEW_INFIMUM &&
+ s <= page_header_get_field(page, PAGE_HEAP_TOP))
+ ? page + s
+ : nullptr;
+}
+inline const rec_t *page_dir_slot_get_rec_validate(const page_dir_slot_t *slot)
+{
+ return page_dir_slot_get_rec_validate(const_cast<rec_t*>(slot));
+}
+
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot); /*!< in: page directory slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is a fraction number
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs); /*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number
+@retval ULINT_UNDEFINED on corruption */
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec); /*!< in: the physical record */
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec); /*!< in: the physical record */
+/** Determine whether a page has any siblings.
+@param[in] page page frame
+@return true if the page has any siblings */
+inline bool page_has_siblings(const page_t* page)
+{
+ compile_time_assert(!(FIL_PAGE_PREV % 8));
+ compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ return *reinterpret_cast<const uint64_t*>(page + FIL_PAGE_PREV)
+ != ~uint64_t(0);
+}
+
+/** Determine whether a page has a predecessor.
+@param[in] page page frame
+@return true if the page has a predecessor */
+inline bool page_has_prev(const page_t* page)
+{
+ return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_PREV)
+ != FIL_NULL;
+}
+
+/** Determine whether a page has a successor.
+@param[in] page page frame
+@return true if the page has a successor */
+inline bool page_has_next(const page_t* page)
+{
+ return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_NEXT)
+ != FIL_NULL;
+}
+
+/** Read the AUTO_INCREMENT value from a clustered index root page.
+@param[in] page clustered index root page
+@return the persisted AUTO_INCREMENT value */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline uint64_t page_get_autoinc(const page_t *page)
+{
+ ut_d(uint16_t page_type= fil_page_get_type(page));
+ ut_ad(page_type == FIL_PAGE_INDEX || page_type == FIL_PAGE_TYPE_INSTANT);
+ ut_ad(!page_has_siblings(page));
+ const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_ROOT_AUTO_INC);
+ return mach_read_from_8(p);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp); /*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record
+@retval nullptr on error */
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record, must not be page
+ infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@param rec record (not page infimum)
+@return pointer to previous record
+@retval nullptr on error */
+inline rec_t *page_rec_get_prev(rec_t *rec)
+{
+ return const_cast<rec_t*>(page_rec_get_prev_const(rec));
+}
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap if page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((const));
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+ const page_t* page); /*!< in: index page */
+/** Read the PAGE_DIRECTION field from a byte.
+@param[in] ptr pointer to PAGE_DIRECTION_B
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr);
+
+/** Read the PAGE_DIRECTION field.
+@param[in] page index page
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_get_direction(const page_t* page)
+{
+ return page_ptr_get_direction(PAGE_HEADER + PAGE_DIRECTION_B + page);
+}
+
+/** Read the PAGE_INSTANT field.
+@param[in] page index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page);
+
+/** Create an uncompressed index page.
+@param[in,out] block buffer block
+@param[in,out] mtr mini-transaction
+@param[in] comp set unless ROW_FORMAT=REDUNDANT */
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp);
+/**********************************************************//**
+Create a compressed B-tree index page. */
+void
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame
+ where the page is created */
+ dict_index_t* index, /*!< in: the index of the
+ page */
+ ulint level, /*!< in: the B-tree level of
+ the page */
+ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */
+ mtr_t* mtr); /*!< in/out: mini-transaction
+ handle */
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+void
+page_create_empty(
+/*==============*/
+ buf_block_t* block, /*!< in/out: B-tree block */
+ dict_index_t* index, /*!< in: the index of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(1,2)));
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
+
+@return error code */
+dberr_t
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
+
+@return pointer to the original successor of the infimum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result));
+/*************************************************************//**
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original predecessor of the supremum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+dberr_t
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/** Create an index page.
+@param[in,out] block buffer block
+@param[in] comp nonzero=compact page format */
+void page_create_low(const buf_block_t* block, bool comp);
+
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets);/*!< in: record descriptor */
+# ifdef UNIV_BTR_PRINT
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+Prints the info in a page header. */
+void
+page_header_print(
+/*==============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn); /*!< in: print rn first and last records
+ in directory */
+# endif /* UNIV_BTR_PRINT */
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+ibool
+page_rec_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */
+#ifdef UNIV_DEBUG
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+void
+page_check_dir(
+/*===========*/
+ const page_t* page); /*!< in: index page */
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_old(
+/*=====================*/
+ const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_new(
+/*=====================*/
+ const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */
+/** Check the consistency of an index page.
+@param[in] page index page
+@param[in] index B-tree or R-tree index
+@return whether the page is valid */
+bool page_validate(const page_t* page, const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no);/*!< in: heap number */
+/** Get the last non-delete-marked record on a page.
+@param[in] page index tree leaf page
+@return the last record, not delete-marked
+@retval infimum record if all records are delete-marked */
+const rec_t *page_find_rec_last_not_deleted(const page_t *page);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#include "page0page.inl"
+
+#endif
diff --git a/storage/innobase/include/page0page.inl b/storage/innobase/include/page0page.inl
new file mode 100644
index 00000000..6c0167ed
--- /dev/null
+++ b/storage/innobase/include/page0page.inl
@@ -0,0 +1,550 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.ic
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+#include "rem0cmp.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(block);
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(trx_id);
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) {
+
+ page_set_max_trx_id(block, page_zip, trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page);
+
+ return(static_cast<node_seq_t>(
+ mach_read_from_8(page + FIL_RTREE_SPLIT_SEQ_NUM)));
+}
+
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ node_seq_t ssn_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_zip || page_zip == &block->page.zip);
+ constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+ if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) &&
+ UNIV_LIKELY_NULL(page_zip))
+ memcpy_aligned<2>(&page_zip->data[field], b, 8);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+{
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ uint16_t offs = page_header_get_field(page, field);
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ return(offs);
+}
+
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out] block file page
+@param[in,out] mtr mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+{
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+ if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memset_aligned<2>(&block->page.zip.data[field], 0, 2);
+}
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ if (page_rec_is_comp(rec)) {
+ return(rec_get_heap_no_new(rec));
+ } else {
+ return(rec_get_heap_no_old(rec));
+ }
+}
+
+/** Determine whether an index page record is a user record.
+@param[in] rec record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec)
+{
+ ut_ad(page_rec_check(rec));
+ return(page_rec_is_user_rec_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the supremum record.
+@param[in] rec record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec)
+{
+ ut_ad(page_rec_check(rec));
+ return(page_rec_is_supremum_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the infimum record.
+@param[in] rec record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec)
+{
+ ut_ad(page_rec_check(rec));
+ return(page_rec_is_infimum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page_get_n_recs(page) > 0);
+
+ return(page_rec_get_next_const(page_get_infimum_rec(page)) == rec);
+}
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page_get_n_recs(page) > 0);
+
+ return(page_rec_get_next_const(rec) == page_get_supremum_rec(page));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+{
+ ulint middle = (ulint(page_get_n_recs(page))
+ + PAGE_HEAP_NO_USER_LOW) / 2;
+
+ return(page_rec_get_nth(page, middle));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_OFFSET));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return mach_read_from_4(my_assume_aligned<2>
+ (page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the number of user records on page (infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec) /*!< in: record */
+{
+ const page_t* page = page_align(rec);
+
+ ut_a(rec);
+
+ ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+ ut_a(page_offset(rec) >= PAGE_DATA);
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot) /*!< in: page directory slot */
+{
+ const rec_t* rec = page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ return(rec_get_n_owned_new(rec));
+ } else {
+ return(rec_get_n_owned_old(rec));
+ }
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs) /*!< in: number of records */
+{
+ return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+ / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ const page_t *page= page_align(rec);
+ ut_ad(page_rec_check(rec));
+ ulint offs= rec_get_next_offs(rec, comp);
+ if (!offs)
+ return nullptr;
+ if (UNIV_UNLIKELY(offs < (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)))
+ return nullptr;
+ if (UNIV_UNLIKELY(offs > page_header_get_field(page, PAGE_HEAP_TOP)))
+ return nullptr;
+ ut_ad(page_rec_is_infimum(rec) ||
+ (!page_is_leaf(page) && !page_has_prev(page)) ||
+ !(rec_get_info_bits(page + offs, comp) & REC_INFO_MIN_REC_FLAG));
+ return page + offs;
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record */
+{
+ return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+#endif /* UNIV_INNOCHECKSUM */
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+ const page_t* page) /*!< in: index page */
+{
+ unsigned ret = page_header_get_field(page, PAGE_HEAP_TOP)
+ - (page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE);
+ ut_ad(ret < srv_page_size);
+ return static_cast<uint16_t>(ret);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ if (comp) {
+ return((ulint)(srv_page_size
+ - PAGE_NEW_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+ }
+
+ return((ulint)(srv_page_size
+ - PAGE_OLD_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+/************************************************************//**
+Each user record on a page, and also the deleted user records in the heap
+takes its size plus the fraction of the dir cell size /
+PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the
+value of page_get_free_space_of_empty, the insert is impossible, otherwise
+it is allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ if (page_is_comp(page)) {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_NEW_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(TRUE);
+ } else {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_OLD_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(FALSE);
+ }
+
+ /* Above the 'n_recs +' part reserves directory space for the new
+ inserted records; the '- 2' excludes page infimum and supremum
+ records */
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/** Read the PAGE_DIRECTION field from a byte.
+@param[in] ptr pointer to PAGE_DIRECTION_B
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr)
+{
+ ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B);
+ return *ptr & ((1U << 3) - 1);
+}
+
+/** Read the PAGE_INSTANT field.
+@param[in] page index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page)
+{
+ uint16_t i = page_header_get_field(page, PAGE_INSTANT);
+#ifdef UNIV_DEBUG
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_INSTANT:
+ ut_ad(page_get_direction(page) <= PAGE_NO_DIRECTION);
+ ut_ad(i >> 3);
+ break;
+ case FIL_PAGE_INDEX:
+ ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page));
+ break;
+ case FIL_PAGE_RTREE:
+ ut_ad(i <= PAGE_NO_DIRECTION);
+ break;
+ default:
+ ut_ad("invalid page type" == 0);
+ break;
+ }
+#endif /* UNIV_DEBUG */
+ return static_cast<uint16_t>(i >> 3); /* i / 8 */
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
new file mode 100644
index 00000000..83fc45cd
--- /dev/null
+++ b/storage/innobase/include/page0types.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "dict0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "ut0new.h"
+
+#include <map>
+
+/** Eliminates a name collision on HP-UX */
+#define page_t ib_page_t
+/** Type of the index page */
+typedef byte page_t;
+#ifndef UNIV_INNOCHECKSUM
+/** Index page cursor */
+struct page_cur_t;
+/** Buffer pool block */
+struct buf_block_t;
+
+/** Compressed index page */
+typedef byte page_zip_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX \
+ (UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/* Page cursor search modes; the values must be in this order! */
+enum page_cur_mode_t {
+ PAGE_CUR_UNSUPP = 0,
+ PAGE_CUR_G = 1,
+ PAGE_CUR_GE = 2,
+ PAGE_CUR_L = 3,
+ PAGE_CUR_LE = 4,
+
+/* PAGE_CUR_LE_OR_EXTENDS = 5,*/ /* This is a search mode used in
+ "column LIKE 'abc%' ORDER BY column DESC";
+ we have to find strings which are <= 'abc' or
+ which extend it */
+
+/* These search mode is for search R-tree index. */
+ PAGE_CUR_CONTAIN = 7,
+ PAGE_CUR_INTERSECT = 8,
+ PAGE_CUR_WITHIN = 9,
+ PAGE_CUR_DISJOINT = 10,
+ PAGE_CUR_MBR_EQUAL = 11,
+ PAGE_CUR_RTREE_INSERT = 12,
+ PAGE_CUR_RTREE_LOCATE = 13,
+ PAGE_CUR_RTREE_GET_FATHER = 14
+};
+
+class buf_pool_t;
+class buf_page_t;
+
+/** Compressed page descriptor */
+struct page_zip_des_t
+{
+ page_zip_t* data; /*!< compressed page data */
+
+ uint32_t m_end:16; /*!< end offset of modification log */
+ uint32_t m_nonempty:1; /*!< TRUE if the modification log
+ is not empty */
+ uint32_t n_blobs:12; /*!< number of externally stored
+ columns on the page; the maximum
+ is 744 on a 16 KiB page */
+ uint32_t ssize:PAGE_ZIP_SSIZE_BITS;
+ /*!< 0 or compressed page shift size;
+ the size in bytes is
+ (UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+#ifdef UNIV_DEBUG
+ uint16_t m_start; /*!< start offset of modification log */
+ bool m_external; /*!< Allocated externally, not from the
+ buffer pool */
+#endif /* UNIV_DEBUG */
+
+ void clear() {
+ /* Clear everything except the member "fix". */
+ memset((void*) this, 0,
+ reinterpret_cast<char*>(&fix)
+ - reinterpret_cast<char*>(this));
+ }
+
+ page_zip_des_t() = default;
+ page_zip_des_t(const page_zip_des_t&) = default;
+
+ /* Initialize everything except the member "fix". */
+ page_zip_des_t(const page_zip_des_t& old, bool) {
+ memcpy((void*) this, (void*) &old,
+ reinterpret_cast<char*>(&fix)
+ - reinterpret_cast<char*>(this));
+ }
+
+private:
+ friend buf_pool_t;
+ friend buf_page_t;
+ /** fix count and state used in buf_page_t */
+ Atomic_relaxed<uint32_t> fix;
+};
+
+/** Compression statistics for a given page size */
+struct page_zip_stat_t {
+ /** Number of page compressions */
+ ulint compressed;
+ /** Number of successful page compressions */
+ ulint compressed_ok;
+ /** Number of page decompressions */
+ ulint decompressed;
+ /** Duration of page compressions in microseconds */
+ ib_uint64_t compressed_usec;
+ /** Duration of page decompressions in microseconds */
+ ib_uint64_t decompressed_usec;
+ page_zip_stat_t() :
+ /* Initialize members to 0 so that when we do
+ stlmap[key].compressed++ and element with "key" does not
+ exist it gets inserted with zeroed members. */
+ compressed(0),
+ compressed_ok(0),
+ decompressed(0),
+ compressed_usec(0),
+ decompressed_usec(0)
+ { }
+};
+
+/** Compression statistics types */
+typedef std::map<
+ index_id_t,
+ page_zip_stat_t,
+ std::less<index_id_t>,
+ ut_allocator<std::pair<const index_id_t, page_zip_stat_t> > >
+ page_zip_stat_per_index_t;
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by dict_index_t::id */
+extern page_zip_stat_per_index_t page_zip_stat_per_index;
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+#endif /* !UNIV_INNOCHECKSUM */
+#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
new file mode 100644
index 00000000..43329906
--- /dev/null
+++ b/storage/innobase/include/page0zip.h
@@ -0,0 +1,383 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#include "buf0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/* Compression level to be used by zlib. Settable by user. */
+extern uint page_zip_level;
+
+/* Default compression level. */
+#define DEFAULT_COMPRESSION_LEVEL 6
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END
+/** Size of an compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE 2
+/** Predefine the sum of DIR_SLOT, TRX_ID & ROLL_PTR */
+#define PAGE_ZIP_CLUST_LEAF_SLOT_SIZE \
+ (PAGE_ZIP_DIR_SLOT_SIZE \
+ + DATA_TRX_ID_LEN \
+ + DATA_ROLL_PTR_LEN)
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK 0x3fffU
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000U
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL 0x8000U
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size); /*!< in: size in bytes */
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in] rec_size length of the record in bytes
+@param[in] comp nonzero=compact format
+@param[in] n_fields number of fields in the record; ignored if
+tablespace is not compressed
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+ ulint zip_size)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+ MY_ATTRIBUTE((const));
+
+/** Check whether a tuple is too big for compressed table
+@param[in] index dict index object
+@param[in] entry entry for the index
+@return true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+ const dict_index_t* index,
+ const dtuple_t* entry);
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+#define page_zip_des_init(page_zip) (page_zip)->clear()
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+ buf_block_t* block, /*!< in/out: buffer block */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint level, /*!< in: commpression level */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+ ulint n, /*!< in: number of fields
+ to compress */
+ const dict_index_t* index, /*!< in: index comprising
+ at least n fields */
+ ulint trx_id_pos,
+ /*!< in: position of the trx_id column
+ in the index, or ULINT_UNDEFINED if
+ this is a non-leaf page */
+ byte* buf); /*!< out: buffer of (n + 1) * 2 bytes */
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+ MY_ATTRIBUTE((nonnull(1,2)));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip); /*!< in: compressed page
+ descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+ MY_ATTRIBUTE((nonnull(1,2)));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+ MY_ATTRIBUTE((nonnull(1,2)));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] rec record in the uncompressed page
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] create nonzero=insert, zero=update
+@param[in,out] mtr mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ ulint create, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in,out] rec record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx_id_field field number of DB_TRX_ID (number of PK fields)
+@param[in] trx_id DB_TRX_ID value (transaction identifier)
+@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer)
+@param[in,out] mtr mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+ buf_block_t* block,
+ byte* rec,
+ const rec_offs* offsets,
+ ulint trx_id_col,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr,
+ mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in] flag the value of the delete-mark flag
+@param[in,out] mtr mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ uint16_t free_rec,/*!< in: record from which rec was
+ allocated, or 0 */
+ byte* rec, /*!< in: record to insert */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(1,3,4)));
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out] block index page
+@param[in,out] rec record being deleted
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] free previous start of the free list
+@param[in,out] mtr mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ const byte *free, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull(1,2,3,4,6)));
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return error code
+@retval DB_FAIL on overflow; the block_zip will be left intact */
+dberr_t
+page_zip_reorganize(
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint z_level,/*!< in: compression level */
+ mtr_t* mtr, /*!< in: mini-transaction */
+ bool restore = false)/*!< whether to restore on failure */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+void
+page_zip_copy_recs(
+ buf_block_t* block, /*!< in/out: buffer block */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr); /*!< in: mini-transaction */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Calculate the compressed page checksum.
+@param data compressed page
+@param size size of compressed page
+@param use_adler whether to use Adler32 instead of a XOR of 3 CRC-32C
+@return page checksum */
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler);
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data ROW_FORMAT=COMPRESSED page
+@param size size of the page, in bytes
+@return whether the stored checksum matches innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size);
+
+#ifndef UNIV_INNOCHECKSUM
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index();
+/*===========================*/
+
+#include "page0zip.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* page0zip_h */
diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl
new file mode 100644
index 00000000..afc877c3
--- /dev/null
+++ b/storage/innobase/include/page0zip.inl
@@ -0,0 +1,317 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.ic
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list. The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page. The infimum and supremum records are
+excluded. The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored. We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+ - (heap_no - 1) << 1 (1..2 bytes)
+ - extra bytes backwards
+ - data bytes
+- clear record:
+ - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+ - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+ - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+ - indexed by heap_no
+ - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+ - REC_NODE_PTR_SIZE for non-leaf pages
+ - 0 otherwise
+(8) dense page directory, stored backwards
+ - n_dense = n_heap - 2
+ - existing records in ascending collation order
+ - deleted records (free list) in link order
+*/
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size;
+
+ if (!page_zip->ssize) {
+ return(0);
+ }
+
+ size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
+
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= srv_page_size);
+
+ return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size) /*!< in: size in bytes */
+{
+ if (size) {
+ unsigned ssize;
+
+ ut_ad(ut_is_2pow(size));
+
+ for (ssize = 1; size > (512U << ssize); ssize++) {
+ }
+
+ page_zip->ssize = ssize & ((1U << PAGE_ZIP_SSIZE_BITS) - 1);
+ } else {
+ page_zip->ssize = 0;
+ }
+
+ ut_ad(page_zip_get_size(page_zip) == size);
+}
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in] rec_size length of the record in bytes
+@param[in] comp nonzero=compact format
+@param[in] n_fields number of fields in the record; ignored if
+tablespace is not compressed
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+ ulint zip_size)
+{
+ /* FIXME: row size check is this function seems to be the most correct.
+ Put it in a separate function and use in more places of InnoDB */
+
+ ut_ad(rec_size
+ > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+ ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE
+ if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE :
+ rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) {
+ return(TRUE);
+ }
+#endif
+
+ if (zip_size) {
+ ut_ad(comp);
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. There should be enough room for
+ one record on an empty leaf page. Subtract 1 byte for
+ the encoded heap number. Check also the available space
+ on the uncompressed page. */
+ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
+ >= page_zip_empty_size(n_fields, zip_size)
+ || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+ }
+
+ return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */
+{
+ ut_ad(page_zip);
+ ut_ad(page_zip->data);
+ ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+ ut_ad(page_zip->m_start <= page_zip->m_end);
+ ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+ ut_ad(page_zip->n_blobs
+ < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine if the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ibool
+page_zip_get_trailer_len(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint uncompressed_size;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(page_zip->data)) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + REC_NODE_PTR_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ } else if (is_clust) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ }
+
+ return (ulint(page_dir_get_n_heap(page_zip->data)) - 2)
+ * uncompressed_size
+ + ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE;
+}
+
+/**********************************************************************//**
+Determine how big record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint trailer_len;
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+
+ return(lint(page_zip_get_size(page_zip)
+ - trailer_len - page_zip->m_end
+ - (REC_N_NEW_EXTRA_BYTES - 2)));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+{
+ ulint trailer_len;
+
+ ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+ /* Subtract the fixed extra bytes and add the maximum
+ space needed for identifying the record (encoded heap_no). */
+ length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+ if (create > 0) {
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ return(length + trailer_len + page_zip->m_end
+ < page_zip_get_size(page_zip));
+}
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index()
+/*===========================*/
+{
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index.clear();
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+}
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
new file mode 100644
index 00000000..e7112d99
--- /dev/null
+++ b/storage/innobase/include/pars0grm.h
@@ -0,0 +1,151 @@
+/* A Bison parser, made by GNU Bison 3.7.6. */
+
+/* Bison interface for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation,
+ Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual,
+ especially those whose name start with YY_ or yy_. They are
+ private implementation details that can be changed or removed. */
+
+#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
+# define YY_YY_PARS0GRM_TAB_H_INCLUDED
+/* Debug traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
+
+/* Token kinds. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ enum yytokentype
+ {
+ YYEMPTY = -2,
+ YYEOF = 0, /* "end of file" */
+ YYerror = 256, /* error */
+ YYUNDEF = 257, /* "invalid token" */
+ PARS_INT_LIT = 258, /* PARS_INT_LIT */
+ PARS_FLOAT_LIT = 259, /* PARS_FLOAT_LIT */
+ PARS_STR_LIT = 260, /* PARS_STR_LIT */
+ PARS_NULL_LIT = 261, /* PARS_NULL_LIT */
+ PARS_ID_TOKEN = 262, /* PARS_ID_TOKEN */
+ PARS_AND_TOKEN = 263, /* PARS_AND_TOKEN */
+ PARS_OR_TOKEN = 264, /* PARS_OR_TOKEN */
+ PARS_NOT_TOKEN = 265, /* PARS_NOT_TOKEN */
+ PARS_GE_TOKEN = 266, /* PARS_GE_TOKEN */
+ PARS_LE_TOKEN = 267, /* PARS_LE_TOKEN */
+ PARS_NE_TOKEN = 268, /* PARS_NE_TOKEN */
+ PARS_PROCEDURE_TOKEN = 269, /* PARS_PROCEDURE_TOKEN */
+ PARS_IN_TOKEN = 270, /* PARS_IN_TOKEN */
+ PARS_INT_TOKEN = 271, /* PARS_INT_TOKEN */
+ PARS_CHAR_TOKEN = 272, /* PARS_CHAR_TOKEN */
+ PARS_IS_TOKEN = 273, /* PARS_IS_TOKEN */
+ PARS_BEGIN_TOKEN = 274, /* PARS_BEGIN_TOKEN */
+ PARS_END_TOKEN = 275, /* PARS_END_TOKEN */
+ PARS_IF_TOKEN = 276, /* PARS_IF_TOKEN */
+ PARS_THEN_TOKEN = 277, /* PARS_THEN_TOKEN */
+ PARS_ELSE_TOKEN = 278, /* PARS_ELSE_TOKEN */
+ PARS_ELSIF_TOKEN = 279, /* PARS_ELSIF_TOKEN */
+ PARS_LOOP_TOKEN = 280, /* PARS_LOOP_TOKEN */
+ PARS_WHILE_TOKEN = 281, /* PARS_WHILE_TOKEN */
+ PARS_RETURN_TOKEN = 282, /* PARS_RETURN_TOKEN */
+ PARS_SELECT_TOKEN = 283, /* PARS_SELECT_TOKEN */
+ PARS_COUNT_TOKEN = 284, /* PARS_COUNT_TOKEN */
+ PARS_FROM_TOKEN = 285, /* PARS_FROM_TOKEN */
+ PARS_WHERE_TOKEN = 286, /* PARS_WHERE_TOKEN */
+ PARS_FOR_TOKEN = 287, /* PARS_FOR_TOKEN */
+ PARS_DDOT_TOKEN = 288, /* PARS_DDOT_TOKEN */
+ PARS_ORDER_TOKEN = 289, /* PARS_ORDER_TOKEN */
+ PARS_BY_TOKEN = 290, /* PARS_BY_TOKEN */
+ PARS_ASC_TOKEN = 291, /* PARS_ASC_TOKEN */
+ PARS_DESC_TOKEN = 292, /* PARS_DESC_TOKEN */
+ PARS_INSERT_TOKEN = 293, /* PARS_INSERT_TOKEN */
+ PARS_INTO_TOKEN = 294, /* PARS_INTO_TOKEN */
+ PARS_VALUES_TOKEN = 295, /* PARS_VALUES_TOKEN */
+ PARS_UPDATE_TOKEN = 296, /* PARS_UPDATE_TOKEN */
+ PARS_SET_TOKEN = 297, /* PARS_SET_TOKEN */
+ PARS_DELETE_TOKEN = 298, /* PARS_DELETE_TOKEN */
+ PARS_CURRENT_TOKEN = 299, /* PARS_CURRENT_TOKEN */
+ PARS_OF_TOKEN = 300, /* PARS_OF_TOKEN */
+ PARS_CREATE_TOKEN = 301, /* PARS_CREATE_TOKEN */
+ PARS_TABLE_TOKEN = 302, /* PARS_TABLE_TOKEN */
+ PARS_INDEX_TOKEN = 303, /* PARS_INDEX_TOKEN */
+ PARS_UNIQUE_TOKEN = 304, /* PARS_UNIQUE_TOKEN */
+ PARS_CLUSTERED_TOKEN = 305, /* PARS_CLUSTERED_TOKEN */
+ PARS_ON_TOKEN = 306, /* PARS_ON_TOKEN */
+ PARS_ASSIGN_TOKEN = 307, /* PARS_ASSIGN_TOKEN */
+ PARS_DECLARE_TOKEN = 308, /* PARS_DECLARE_TOKEN */
+ PARS_CURSOR_TOKEN = 309, /* PARS_CURSOR_TOKEN */
+ PARS_SQL_TOKEN = 310, /* PARS_SQL_TOKEN */
+ PARS_OPEN_TOKEN = 311, /* PARS_OPEN_TOKEN */
+ PARS_FETCH_TOKEN = 312, /* PARS_FETCH_TOKEN */
+ PARS_CLOSE_TOKEN = 313, /* PARS_CLOSE_TOKEN */
+ PARS_NOTFOUND_TOKEN = 314, /* PARS_NOTFOUND_TOKEN */
+ PARS_TO_BINARY_TOKEN = 315, /* PARS_TO_BINARY_TOKEN */
+ PARS_SUBSTR_TOKEN = 316, /* PARS_SUBSTR_TOKEN */
+ PARS_CONCAT_TOKEN = 317, /* PARS_CONCAT_TOKEN */
+ PARS_INSTR_TOKEN = 318, /* PARS_INSTR_TOKEN */
+ PARS_LENGTH_TOKEN = 319, /* PARS_LENGTH_TOKEN */
+ PARS_COMMIT_TOKEN = 320, /* PARS_COMMIT_TOKEN */
+ PARS_ROLLBACK_TOKEN = 321, /* PARS_ROLLBACK_TOKEN */
+ PARS_WORK_TOKEN = 322, /* PARS_WORK_TOKEN */
+ PARS_EXIT_TOKEN = 323, /* PARS_EXIT_TOKEN */
+ PARS_FUNCTION_TOKEN = 324, /* PARS_FUNCTION_TOKEN */
+ PARS_LOCK_TOKEN = 325, /* PARS_LOCK_TOKEN */
+ PARS_SHARE_TOKEN = 326, /* PARS_SHARE_TOKEN */
+ PARS_MODE_TOKEN = 327, /* PARS_MODE_TOKEN */
+ PARS_LIKE_TOKEN = 328, /* PARS_LIKE_TOKEN */
+ PARS_LIKE_TOKEN_EXACT = 329, /* PARS_LIKE_TOKEN_EXACT */
+ PARS_LIKE_TOKEN_PREFIX = 330, /* PARS_LIKE_TOKEN_PREFIX */
+ PARS_LIKE_TOKEN_SUFFIX = 331, /* PARS_LIKE_TOKEN_SUFFIX */
+ PARS_LIKE_TOKEN_SUBSTR = 332, /* PARS_LIKE_TOKEN_SUBSTR */
+ PARS_TABLE_NAME_TOKEN = 333, /* PARS_TABLE_NAME_TOKEN */
+ PARS_BIGINT_TOKEN = 334, /* PARS_BIGINT_TOKEN */
+ NEG = 335 /* NEG */
+ };
+ typedef enum yytokentype yytoken_kind_t;
+#endif
+
+/* Value type. */
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+extern YYSTYPE yylval;
+
+int yyparse (void);
+
+#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED */
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
new file mode 100644
index 00000000..07a726ea
--- /dev/null
+++ b/storage/innobase/include/pars0opt.h
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node); /*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp); /*!< in: expression or condition */
+#ifdef UNIV_SQL_DEBUG
+/********************************************************************//**
+Prints info of a query plan. */
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node); /*!< in: select node */
+#endif /* UNIV_SQL_DEBUG */
+
+#endif
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
new file mode 100644
index 00000000..16823ce1
--- /dev/null
+++ b/storage/innobase/include/pars0pars.h
@@ -0,0 +1,695 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "row0mysql.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef ibool (*pars_user_func_cb_t)(void* arg, void* user_arg);
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int yydebug;
+
+/* Global variable used while parsing a single procedure or query : the code is
+NOT re-entrant */
+extern sym_tab_t* pars_sym_tab_global;
+
+extern pars_res_word_t pars_to_binary_token;
+extern pars_res_word_t pars_substr_token;
+extern pars_res_word_t pars_concat_token;
+extern pars_res_word_t pars_length_token;
+extern pars_res_word_t pars_instr_token;
+extern pars_res_word_t pars_count_token;
+extern pars_res_word_t pars_int_token;
+extern pars_res_word_t pars_bigint_token;
+extern pars_res_word_t pars_char_token;
+extern pars_res_word_t pars_update_token;
+extern pars_res_word_t pars_asc_token;
+extern pars_res_word_t pars_desc_token;
+extern pars_res_word_t pars_open_token;
+extern pars_res_word_t pars_close_token;
+extern pars_res_word_t pars_share_token;
+extern pars_res_word_t pars_unique_token;
+extern pars_res_word_t pars_clustered_token;
+
+extern ulint pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT 0
+#define PARS_OUTPUT 1
+#define PARS_NOT_PARAM 2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str); /*!< in: SQL string */
+/*************************************************************//**
+Retrieves characters to the lexical analyzer.
+@return number of characters copied or 0 on EOF */
+int
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /*!< in/out: buffer where to copy */
+ size_t max_size); /*!< in: maximum number of characters which fit
+ in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+void
+yyerror(
+/*====*/
+ const char* s); /*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg); /*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return own: function node in a query tree */
+int
+pars_like_rebind(
+/*=============*/
+ sym_node_t* node, /* in: The search string node.*/
+ const byte* ptr, /* in: literal to (re) bind */
+ ulint len); /* in: length of literal to (re) bind*/
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2); /*!< in: second argument or NULL for an unary
+ operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc); /*!< in: &pars_asc_token or pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list); /*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node); /*!< in: function id node in the symbol
+ table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* consistent_read,/*!< in: NULL or
+ &pars_consistent_token */
+ order_node_t* order_by); /*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL
+ if delete */
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond); /*!< in: search condition or NULL */
+/*********************************************************************//**
+Parses an insert statement.
+@return own: update node in a query tree */
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select); /*!< in: select condition or NULL */
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part); /*!< in: else-part statement list */
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+exit_node_t*
+pars_exit_statement(void);
+/*=====================*/
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args); /*!< in: argument list */
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func); /*!< in: user function name, or NULL */
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return fetch statement node */
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor); /*!< in: cursor node */
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_not_null); /*!< in: if not NULL, column
+ is of type NOT NULL. */
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs); /*!< in: list of column names */
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list); /*!< in: list of column names */
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ que_node_t* stat_list); /*!< in: statement list */
+
+/** Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running.
+@param[in] node root node for an incomplete query
+ graph, or NULL for dummy graph
+@param[in] trx transaction handle
+@param[in] heap memory heap from which allocated
+@param[in] prebuilt row prebuilt structure
+@return query thread node to run */
+que_thr_t*
+pars_complete_graph_for_exec(
+ que_node_t* node,
+ trx_t* trx,
+ mem_heap_t* heap,
+ row_prebuilt_t* prebuilt)
+ MY_ATTRIBUTE((nonnull(2,3), warn_unused_result));
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+pars_info_t*
+pars_info_create(void);
+/*==================*/
+
+/****************************************************************//**
+Add bound literal. */
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str); /*!< in: string */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+void
+pars_info_bind_literal(
+/*===================*/
+ pars_info_t* info, /* in: info struct */
+ const char* name, /* in: name */
+ const void* address, /* in: address */
+ ulint length, /* in: length of data */
+ ulint type, /* in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /* in: precise type, e.g. */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const byte* str, /*!< in: string */
+ ulint str_len); /*!< in: string length */
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_bind_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint32_t* val); /*!< in: value */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+void
+pars_info_bind_int8_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val); /*!< in: value */
+/****************************************************************//**
+Add user function. */
+void
+pars_info_bind_function(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg); /*!< in: user-supplied argument */
+/****************************************************************//**
+Add bound id. */
+void
+pars_info_bind_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* id); /*!< in: id */
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ulint val); /*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_ull_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ib_uint64_t val); /*!< in: value */
+
+/****************************************************************//**
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val) /*!< in: value */
+ MY_ATTRIBUTE((nonnull));
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound id name to find */
+
+/******************************************************************//**
+Release any resources used by the lexer. */
+void
+pars_lexer_close(void);
+/*==================*/
+
+/** Extra information supplied for pars_sql(). */
+struct pars_info_t {
+ mem_heap_t* heap; /*!< our own memory heap */
+
+ ib_vector_t* funcs; /*!< user functions, or NUll
+ (pars_user_func_t*) */
+ ib_vector_t* bound_lits; /*!< bound literals, or NULL
+ (pars_bound_lit_t*) */
+ ib_vector_t* bound_ids; /*!< bound ids, or NULL
+ (pars_bound_id_t*) */
+};
+
+inline void pars_info_free(pars_info_t *info) { mem_heap_free(info->heap); }
+
+/** User-supplied function and argument. */
+struct pars_user_func_t {
+ const char* name; /*!< function name */
+ pars_user_func_cb_t func; /*!< function address */
+ void* arg; /*!< user-supplied argument */
+};
+
+/** Bound literal. */
+struct pars_bound_lit_t {
+ const char* name; /*!< name */
+ const void* address; /*!< address */
+ ulint length; /*!< length of data */
+ ulint type; /*!< type, e.g. DATA_FIXBINARY */
+ ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */
+ sym_node_t* node; /*!< symbol node */
+};
+
+/** Bound identifier. */
+struct pars_bound_id_t {
+ const char* name; /*!< name */
+ const char* id; /*!< identifier */
+};
+
+/** Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_t{
+ int code; /*!< the token code for the reserved word from
+ pars0grm.h */
+};
+
+/** A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FUNC */
+ int func; /*!< token code of the function name */
+ ulint fclass; /*!< class of the function */
+ que_node_t* args; /*!< argument(s) of the function */
+ UT_LIST_NODE_T(func_node_t) cond_list;
+ /*!< list of comparison conditions; defined
+ only for comparison operator nodes except,
+ presently, for OPT_SCROLL_TYPE ones */
+ UT_LIST_NODE_T(func_node_t) func_node_list;
+ /*!< list of function nodes in a parsed
+ query graph */
+};
+
+/** An order-by node in a select */
+struct order_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ORDER */
+ sym_node_t* column; /*!< order-by column */
+ ibool asc; /*!< TRUE if ascending, FALSE if descending */
+};
+
+/** Procedure definition node */
+struct proc_node_t{
+ que_common_t common; /*!< type: QUE_NODE_PROC */
+ sym_node_t* proc_id; /*!< procedure name symbol in the symbol
+ table of this same procedure */
+ que_node_t* stat_list; /*!< statement list */
+ sym_tab_t* sym_tab; /*!< symbol table of this procedure */
+};
+
+/** elsif-element node */
+struct elsif_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ELSIF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** if-statement node */
+struct if_node_t{
+ que_common_t common; /*!< type: QUE_NODE_IF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+ que_node_t* else_part; /*!< else-part statement list */
+ elsif_node_t* elsif_list; /*!< elsif element list */
+};
+
+/** while-statement node */
+struct while_node_t{
+ que_common_t common; /*!< type: QUE_NODE_WHILE */
+ que_node_t* cond; /*!< while condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** for-loop-statement node */
+struct for_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FOR */
+ sym_node_t* loop_var; /*!< loop variable: this is the
+ dereferenced symbol from the
+ variable declarations, not the
+ symbol occurrence in the for loop
+ definition */
+ que_node_t* loop_start_limit;/*!< initial value of loop variable */
+ que_node_t* loop_end_limit; /*!< end value of loop variable */
+ lint loop_end_value; /*!< evaluated value for the end value:
+ it is calculated only when the loop
+ is entered, and will not change within
+ the loop */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** exit statement node */
+struct exit_node_t{
+ que_common_t common; /*!< type: QUE_NODE_EXIT */
+};
+
+/** return-statement node */
+struct return_node_t{
+ que_common_t common; /*!< type: QUE_NODE_RETURN */
+};
+
+/** Assignment statement node */
+struct assign_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */
+ sym_node_t* var; /*!< variable to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Column assignment node */
+struct col_assign_node_t{
+ que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */
+ sym_node_t* col; /*!< column to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Classes of functions */
+/* @{ */
+#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */
+#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */
+#define PARS_FUNC_CMP 3 /*!< comparison operators */
+#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */
+#define PARS_FUNC_AGGREGATE 5 /*!< COUNT */
+#define PARS_FUNC_OTHER 6 /*!< these are not real functions,
+ e.g., := */
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h
new file mode 100644
index 00000000..59f6cc31
--- /dev/null
+++ b/storage/innobase/include/pars0sym.h
@@ -0,0 +1,243 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.h
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Frees also SQL explicit cursor definitions. */
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val); /*!< in: integer value */
+/******************************************************************//**
+Adds an string literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len); /*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */
+/**********************************************************************
+Rebind literal to a node in the symbol table. */
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+ /* out: symbol table node */
+ sym_node_t* node, /* in: node that is bound to literal*/
+ const void* address, /* in: pointer to data */
+ ulint length); /* in: length of data */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len); /*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name); /*!< in: name of bound id */
+
+/** Index of sym_node_t::field_nos corresponding to the clustered index */
+#define SYM_CLUST_FIELD_NO 0
+/** Index of sym_node_t::field_nos corresponding to a secondary index */
+#define SYM_SEC_FIELD_NO 1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+ SYM_UNSET, /*!< Unset entry. */
+ SYM_VAR = 91, /*!< declared parameter or local
+ variable of a procedure */
+ SYM_IMPLICIT_VAR, /*!< storage for a intermediate result
+ of a calculation */
+ SYM_LIT, /*!< literal */
+ SYM_TABLE_REF_COUNTED, /*!< database table name, ref counted. Must
+ be closed explicitly. */
+ SYM_TABLE, /*!< database table name */
+ SYM_COLUMN, /*!< database table name */
+ SYM_CURSOR, /*!< named cursor */
+ SYM_PROCEDURE_NAME, /*!< stored procedure name */
+ SYM_INDEX, /*!< database index name */
+ SYM_FUNCTION /*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_t{
+ que_common_t common; /*!< node type:
+ QUE_NODE_SYMBOL */
+ /* NOTE: if the data field in 'common.val' is not NULL and the symbol
+ table node is not for a temporary column, the memory for the value has
+ been allocated from dynamic memory and it should be freed when the
+ symbol table is discarded */
+
+ /* 'alias' and 'indirection' are almost the same, but not quite.
+ 'alias' always points to the primary instance of the variable, while
+ 'indirection' does the same only if we should use the primary
+ instance's values for the node's data. This is usually the case, but
+ when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+ t WHERE id = x;"), we copy the values from the primary instance to
+ the cursor's instance so that they are fixed for the duration of the
+ cursor, and set 'indirection' to NULL. If we did not, the value of
+ 'x' could change between fetches and things would break horribly.
+
+ TODO: It would be cleaner to make 'indirection' a boolean field and
+ always use 'alias' to refer to the primary node. */
+
+ sym_node_t* indirection; /*!< pointer to
+ another symbol table
+ node which contains
+ the value for this
+ node, NULL otherwise */
+ sym_node_t* alias; /*!< pointer to
+ another symbol table
+ node for which this
+ node is an alias,
+ NULL otherwise */
+ UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table
+ columns or a list of
+ input variables for an
+ explicit cursor */
+ ibool copy_val; /*!< TRUE if a column
+ and its value should
+ be copied to dynamic
+ memory when fetched */
+ ulint field_nos[2]; /*!< if a column, in
+ the position
+ SYM_CLUST_FIELD_NO is
+ the field number in the
+ clustered index; in
+ the position
+ SYM_SEC_FIELD_NO
+ the field number in the
+ non-clustered index to
+ use first; if not found
+ from the index, then
+ ULINT_UNDEFINED */
+ ibool resolved; /*!< TRUE if the
+ meaning of a variable
+ or a column has been
+ resolved; for literals
+ this is always TRUE */
+ enum sym_tab_entry token_type; /*!< type of the
+ parsed token */
+ const char* name; /*!< name of an id */
+ ulint name_len; /*!< id name length */
+ dict_table_t* table; /*!< table definition
+ if a table id or a
+ column id */
+ ulint col_no; /*!< column number if a
+ column */
+ sel_buf_t* prefetch_buf; /*!< NULL, or a buffer
+ for cached column
+ values for prefetched
+ rows */
+ sel_node_t* cursor_def; /*!< cursor definition
+ select node if a
+ named cursor */
+ ulint param_type; /*!< PARS_INPUT,
+ PARS_OUTPUT, or
+ PARS_NOT_PARAM if not a
+ procedure parameter */
+ sym_tab_t* sym_table; /*!< back pointer to
+ the symbol table */
+ UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol
+ nodes */
+ sym_node_t* like_node; /* LIKE operator node*/
+};
+
+/** Symbol table */
+struct sym_tab_t{
+ que_t* query_graph;
+ /*!< query graph generated by the
+ parser */
+ const char* sql_string;
+ /*!< SQL string to parse */
+ size_t string_len;
+ /*!< SQL string length */
+ size_t next_char_pos;
+ /*!< position of the next character in
+ sql_string to give to the lexical
+ analyzer */
+ pars_info_t* info; /*!< extra information, or NULL */
+ sym_node_list_t sym_list;
+ /*!< list of symbol nodes in the symbol
+ table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ func_node_list;
+ /*!< list of function nodes in the
+ parsed query graph */
+ mem_heap_t* heap; /*!< memory heap from which we can
+ allocate space */
+};
+
+#endif
diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h
new file mode 100644
index 00000000..f5b69522
--- /dev/null
+++ b/storage/innobase/include/pars0types.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0types.h
+SQL parser global types
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+struct pars_info_t;
+struct pars_user_func_t;
+struct pars_bound_lit_t;
+struct pars_bound_id_t;
+struct sym_node_t;
+struct sym_tab_t;
+struct pars_res_word_t;
+struct func_node_t;
+struct order_node_t;
+struct proc_node_t;
+struct elsif_node_t;
+struct if_node_t;
+struct while_node_t;
+struct for_node_t;
+struct exit_node_t;
+struct return_node_t;
+struct assign_node_t;
+struct col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t;
+
+#endif
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
new file mode 100644
index 00000000..c60f390a
--- /dev/null
+++ b/storage/innobase/include/que0que.h
@@ -0,0 +1,314 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.h
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "data0data.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+que_fork_t *que_fork_create(mem_heap_t* heap);
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent);/*!< in: parent */
+/** Creates a query graph thread node.
+@param[in] parent parent node, i.e., a fork node
+@param[in] heap memory heap where created
+@param[in] prebuilt row prebuilt structure
+@return own: query thread node */
+que_thr_t*
+que_thr_create(
+ que_fork_t* parent,
+ mem_heap_t* heap,
+ row_prebuilt_t* prebuilt);
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free. */
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node); /*!< in: query graph node */
+/**********************************************************************//**
+Frees a query graph. */
+void
+que_graph_free(
+/*===========*/
+ que_t* graph); /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork); /*!< in: a query fork */
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ const que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size); /*!< in: size */
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node); /*!< in: node in a list */
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node); /*!< in: node */
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Catenates a query graph node to a list of them, possible empty list.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node); /*!< in: node */
+/*************************************************************************
+Get the last node from the list.*/
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+ /* out: node last node from list.*/
+ que_node_t* node_list); /* in: node list, or NULL */
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list); /*!< in: node list, or NULL */
+/*********************************************************************//**
+Evaluate the given SQL
+@return error code or DB_SUCCESS */
+dberr_t
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ trx_t* trx); /*!< in: trx */
+
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+ que_fork_t* fork, /*!< in: a query fork */
+ que_thr_t* thr); /*!< in: current pos */
+
+/** Query thread states */
+enum que_thr_state_t {
+ /** in selects this means that the thread is at the end of its
+ result set (or start, in case of a scroll cursor); in other
+ statements, this means the thread has done its task */
+ QUE_THR_COMPLETED,
+ QUE_THR_RUNNING
+};
+
+/** Query thread lock states */
+enum que_thr_lock_t {
+ QUE_THR_LOCK_NOLOCK,
+ QUE_THR_LOCK_ROW,
+ QUE_THR_LOCK_TABLE
+};
+
+/* Query graph query thread node: the fields are protected by the
+trx_t::mutex with the exceptions named below */
+
+struct que_thr_t{
+ que_common_t common; /*!< type: QUE_NODE_THR */
+ que_node_t* child; /*!< graph child node */
+ que_t* graph; /*!< graph where this node belongs */
+ que_thr_state_t state; /*!< state of the query thread */
+ /*------------------------------*/
+ /* The following fields are private to the OS thread executing the
+ query thread, and are not protected by any mutex: */
+
+ que_node_t* run_node; /*!< pointer to the node where the
+ subgraph down from this node is
+ currently executed */
+ que_node_t* prev_node; /*!< pointer to the node from which
+ the control came */
+ ulint resource; /*!< resource usage of the query thread
+ thus far */
+ ulint lock_state; /*!< lock state of thread (table or
+ row) */
+ /*------------------------------*/
+ /* The following fields are links for the various lists that
+ this type can be on. */
+ UT_LIST_NODE_T(que_thr_t)
+ thrs; /*!< list of thread nodes of the fork
+ node */
+ UT_LIST_NODE_T(que_thr_t)
+ queue; /*!< list of runnable thread nodes in
+ the server task queue */
+ ulint fk_cascade_depth; /*!< maximum cascading call depth
+ supported for foreign key constraint
+ related delete/updates */
+ row_prebuilt_t* prebuilt; /*!< prebuilt structure processed by
+ the query thread */
+};
+
+/* Query graph fork node: its fields are protected by the query thread mutex */
+struct que_fork_t{
+ que_common_t common; /*!< type: QUE_NODE_FORK */
+ que_t* graph; /*!< query graph of this node */
+ trx_t* trx; /*!< transaction: this is set only in
+ the root node */
+ ulint state; /*!< state of the fork node */
+ que_thr_t* caller; /*!< pointer to a possible calling query
+ thread */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ thrs; /*!< list of query threads */
+ /*------------------------------*/
+ /* The fields in this section are defined only in the root node */
+ sym_tab_t* sym_tab; /*!< symbol table of the query,
+ generated by the parser, or NULL
+ if the graph was created 'by hand' */
+ pars_info_t* info; /*!< info struct, or NULL */
+
+ sel_node_t* last_sel_node; /*!< last executed select node, or NULL
+ if none */
+ UT_LIST_NODE_T(que_fork_t)
+ graphs; /*!< list of query graphs of a session
+ or a stored procedure */
+ /*------------------------------*/
+ mem_heap_t* heap; /*!< memory heap where the fork was
+ created */
+
+};
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE 1
+#define QUE_FORK_COMMAND_WAIT 2
+
+/* Flag which is ORed to control structure statement node types */
+#define QUE_NODE_CONTROL_STAT 1024
+
+#include "que0que.inl"
+
+#endif
diff --git a/storage/innobase/include/que0que.inl b/storage/innobase/include/que0que.inl
new file mode 100644
index 00000000..e21cbad3
--- /dev/null
+++ b/storage/innobase/include/que0que.inl
@@ -0,0 +1,245 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.ic
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr);
+
+ return(thr->graph->trx);
+}
+
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ que_thr_t* thr;
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ return(thr->child);
+}
+
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ const que_node_t* node) /*!< in: graph node */
+{
+ return(reinterpret_cast<const que_common_t*>(node)->type);
+}
+
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(&(((que_common_t*) node)->val));
+}
+
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*) node)->val_buf_size);
+}
+
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size) /*!< in: size */
+{
+ ut_ad(node);
+
+ ((que_common_t*) node)->val_buf_size = size;
+}
+
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent) /*!< in: parent */
+{
+ ut_ad(node);
+
+ ((que_common_t*) node)->parent = parent;
+}
+
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(dfield_get_type(&((que_common_t*) node)->val));
+}
+
+/*********************************************************************//**
+Catenates a query graph node to a list of them, possible empty list.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node) /*!< in: node */
+{
+ que_common_t* cnode;
+ que_common_t* cnode2;
+
+ cnode = (que_common_t*) node;
+
+ cnode->brother = NULL;
+
+ if (node_list == NULL) {
+
+ return(node);
+ }
+
+ cnode2 = (que_common_t*) node_list;
+
+ while (cnode2->brother != NULL) {
+ cnode2 = (que_common_t*) cnode2->brother;
+ }
+
+ cnode2->brother = node;
+
+ return(node_list);
+}
+
+/*************************************************************************
+Removes a query graph node from the list.*/
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+ /* out: last node in list.*/
+ que_node_t* node_list) /* in: node list */
+{
+ que_common_t* node;
+
+ ut_a(node_list != NULL);
+
+ node = (que_common_t*) node_list;
+
+ /* We need the last element */
+ while (node->brother != NULL) {
+ node = (que_common_t*) node->brother;
+ }
+
+ return(node);
+}
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes.
+@return next node in a list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node) /*!< in: node in a list */
+{
+ return(((que_common_t*) node)->brother);
+}
+
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list) /*!< in: node list, or NULL */
+{
+ const que_common_t* cnode;
+ ulint len;
+
+ cnode = (const que_common_t*) node_list;
+ len = 0;
+
+ while (cnode != NULL) {
+ len++;
+ cnode = (const que_common_t*) cnode->brother;
+ }
+
+ return(len);
+}
+
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node) /*!< in: node */
+{
+ return(((que_common_t*) node)->parent);
+}
diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h
new file mode 100644
index 00000000..38f6e380
--- /dev/null
+++ b/storage/innobase/include/que0types.h
@@ -0,0 +1,97 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0types.h
+Query graph global types
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+
+/* Pseudotype for all graph nodes */
+typedef void que_node_t;
+
+/* Query graph root is a fork node */
+typedef struct que_fork_t que_t;
+
+struct row_prebuilt_t;
+struct que_thr_t;
+
+/* Query graph node types */
+#define QUE_NODE_LOCK 1
+#define QUE_NODE_INSERT 2
+#define QUE_NODE_UPDATE 4
+#define QUE_NODE_CURSOR 5
+#define QUE_NODE_SELECT 6
+#define QUE_NODE_AGGREGATE 7
+#define QUE_NODE_FORK 8
+#define QUE_NODE_THR 9
+#define QUE_NODE_UNDO 10
+#define QUE_NODE_COMMIT 11
+#define QUE_NODE_ROLLBACK 12
+#define QUE_NODE_PURGE 13
+#define QUE_NODE_CREATE_TABLE 14
+#define QUE_NODE_CREATE_INDEX 15
+#define QUE_NODE_SYMBOL 16
+#define QUE_NODE_RES_WORD 17
+#define QUE_NODE_FUNC 18
+#define QUE_NODE_ORDER 19
+#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT 23
+#define QUE_NODE_FETCH 24
+#define QUE_NODE_OPEN 25
+#define QUE_NODE_COL_ASSIGNMENT 26
+#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN 28
+#define QUE_NODE_ROW_PRINTF 29
+#define QUE_NODE_ELSIF 30
+#define QUE_NODE_CALL 31
+#define QUE_NODE_EXIT 32
+
+/* Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_t{
+ ulint type; /*!< query node type */
+ que_node_t* parent; /*!< back pointer to parent node, or NULL */
+ que_node_t* brother;/* pointer to a possible brother node */
+ dfield_t val; /*!< evaluated value for an expression */
+ ulint val_buf_size;
+ /* buffer size for the evaluated value data,
+ if the buffer has been allocated dynamically:
+ if this field is != 0, and the node is a
+ symbol node or a function node, then we
+ have to free the data field in val
+ explicitly */
+
+ /** Constructor */
+ que_common_t(ulint type, que_node_t* parent) :
+ type(type), parent(parent), brother(NULL),
+ val(), val_buf_size(0)
+ {}
+};
+
+#endif
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
new file mode 100644
index 00000000..e002f1b7
--- /dev/null
+++ b/storage/innobase/include/read0types.h
@@ -0,0 +1,275 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0types.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0mem.h"
+#include "trx0types.h"
+#include "srw_lock.h"
+#include <algorithm>
+
+/**
+ Read view lists the trx ids of those transactions for which a consistent read
+ should not see the modifications to the database.
+*/
+class ReadViewBase
+{
+ /**
+ The read should not see any transaction with trx id >= this value.
+ In other words, this is the "high water mark".
+ */
+ trx_id_t m_low_limit_id= 0;
+
+ /**
+ The read should see all trx ids which are strictly
+ smaller (<) than this value. In other words, this is the
+ low water mark".
+ */
+ trx_id_t m_up_limit_id;
+
+ /** Set of RW transactions that was active when this snapshot was taken */
+ trx_ids_t m_ids;
+
+ /**
+ The view does not need to see the undo logs for transactions whose
+ transaction number is strictly smaller (<) than this value: they can be
+ removed in purge if not needed by other views.
+ */
+ trx_id_t m_low_limit_no;
+
+protected:
+ bool empty() { return m_ids.empty(); }
+
+ /** @return the up limit id */
+ trx_id_t up_limit_id() const { return m_up_limit_id; }
+
+public:
+ /**
+ Append state from another view.
+
+ This method is used to find min(m_low_limit_no), min(m_low_limit_id) and
+ all transaction ids below min(m_low_limit_id). These values effectively
+ form oldest view.
+
+ @param other view to copy from
+ */
+ void append(const ReadViewBase &other)
+ {
+ ut_ad(&other != this);
+ if (m_low_limit_no > other.m_low_limit_no)
+ m_low_limit_no= other.m_low_limit_no;
+ if (m_low_limit_id > other.m_low_limit_id)
+ m_low_limit_id= other.m_low_limit_id;
+
+ trx_ids_t::iterator dst= m_ids.begin();
+ for (const trx_id_t id : other.m_ids)
+ {
+ if (id >= m_low_limit_id)
+ break;
+loop:
+ if (dst == m_ids.end())
+ {
+ m_ids.push_back(id);
+ dst= m_ids.end();
+ continue;
+ }
+ if (*dst < id)
+ {
+ dst++;
+ goto loop;
+ }
+ else if (*dst > id)
+ dst= m_ids.insert(dst, id) + 1;
+ }
+ m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id),
+ m_ids.end());
+
+ m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front();
+ ut_ad(m_up_limit_id <= m_low_limit_id);
+ }
+
+
+ /**
+ Creates a snapshot where exactly the transactions serialized before this
+ point in time are seen in the view.
+
+ @param[in,out] trx transaction
+ */
+ inline void snapshot(trx_t *trx);
+
+
+ /**
+ Check whether the changes by id are visible.
+ @param[in] id transaction id to check against the view
+ @return whether the view sees the modifications of id.
+ */
+ bool changes_visible(trx_id_t id) const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ if (id >= m_low_limit_id)
+ return false;
+ return id < m_up_limit_id ||
+ m_ids.empty() ||
+ !std::binary_search(m_ids.begin(), m_ids.end(), id);
+ }
+
+ /**
+ @param id transaction to check
+ @return true if view sees transaction id
+ */
+ bool sees(trx_id_t id) const { return id < m_up_limit_id; }
+
+ /** @return the low limit no */
+ trx_id_t low_limit_no() const { return m_low_limit_no; }
+
+ /** @return the low limit id */
+ trx_id_t low_limit_id() const { return m_low_limit_id; }
+
+ /** Clamp the low limit id for purge_sys.end_view */
+ void clamp_low_limit_id(trx_id_t limit)
+ {
+ if (m_low_limit_id > limit)
+ m_low_limit_id= limit;
+ }
+};
+
+
+/** A ReadView with extra members required for trx_t::read_view. */
+class ReadView: public ReadViewBase
+{
+ /**
+ View state.
+
+ Implemented as atomic to allow mutex-free view close and re-use.
+ Non-owner thread is allowed to call is_open() alone without mutex
+ protection as well. E.g. trx_sys.view_count() does this.
+
+ If non-owner thread intends to access other members as well, both
+ is_open() and other members accesses must be protected by m_mutex.
+ E.g. copy_to().
+ */
+ std::atomic<bool> m_open;
+
+ /** For synchronisation with purge coordinator. */
+ mutable srw_mutex m_mutex;
+
+ /**
+ trx id of creating transaction.
+ Used exclusively by the read view owner thread.
+ */
+ trx_id_t m_creator_trx_id;
+
+public:
+ ReadView()
+ {
+ memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+ m_mutex.init();
+ }
+ ~ReadView() { m_mutex.destroy(); }
+
+
+ /**
+ Opens a read view where exactly the transactions serialized before this
+ point in time are seen in the view.
+
+ View becomes visible to purge thread. Intended to be called by the ReadView
+ owner thread.
+
+ @param[in,out] trx transaction
+ */
+ void open(trx_t *trx);
+
+
+ /**
+ Closes the view.
+
+ View becomes not visible to purge thread. Intended to be called by the
+ ReadView owner thread.
+ */
+ void close() { m_open.store(false, std::memory_order_relaxed); }
+
+
+ /** Returns true if view is open. */
+ bool is_open() const { return m_open.load(std::memory_order_relaxed); }
+
+
+ /**
+ Sets the creator transaction id.
+
+ This should be set only for views created by RW transactions.
+ Intended to be called by the ReadView owner thread.
+ */
+ void set_creator_trx_id(trx_id_t id)
+ {
+ ut_ad(m_creator_trx_id == 0);
+ m_creator_trx_id= id;
+ }
+
+
+ /**
+ Writes the limits to the file.
+ @param file file to write to
+ */
+ void print_limits(FILE *file) const
+ {
+ m_mutex.wr_lock();
+ if (is_open())
+ fprintf(file, "Trx read view will not see trx with"
+ " id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n",
+ low_limit_id(), up_limit_id());
+ m_mutex.wr_unlock();
+ }
+
+
+ /**
+ A wrapper around ReadViewBase::changes_visible().
+ Intended to be called by the ReadView owner thread.
+ */
+ bool changes_visible(trx_id_t id) const
+ { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); }
+
+ /**
+ A wrapper around ReadViewBase::append().
+ Intended to be called by the purge coordinator task.
+ */
+ void append_to(ReadViewBase *to) const
+ {
+ m_mutex.wr_lock();
+ if (is_open())
+ to->append(*this);
+ m_mutex.wr_unlock();
+ }
+
+ /**
+ Declare the object mostly unaccessible.
+ */
+ void mem_noaccess() const
+ {
+ MEM_NOACCESS(&m_open, sizeof m_open);
+ /* m_mutex is accessed via trx_sys.rw_trx_hash */
+ MEM_NOACCESS(&m_creator_trx_id, sizeof m_creator_trx_id);
+ }
+};
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
new file mode 100644
index 00000000..3a30f5a9
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.h
@@ -0,0 +1,286 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#pragma once
+
+#include "data0data.h"
+#include "data0type.h"
+#include "rem0types.h"
+#include "page0types.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets);
+ /*!< in: whether to check charsets */
+/** Compare two data fields.
+@param mtype main type
+@param prtype precise type
+@param descending whether to use descending order
+@param data1 data field
+@param len1 length of data1 in bytes, or UNIV_SQL_NULL
+@param data2 data field
+@param len2 length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+int cmp_data(ulint mtype, ulint prtype, bool descending,
+ const byte *data1, size_t len1, const byte *data2, size_t len2)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Compare two data fields.
+@param dfield1 data field; must have type field set
+@param dfield2 data field
+@param descending whether to use descending order
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+inline int cmp_dfield_dfield(const dfield_t *dfield1, const dfield_t *dfield2,
+ bool descending= false)
+{
+ ut_ad(dfield_check_typed(dfield1));
+ const dtype_t *type= dfield_get_type(dfield1);
+ return cmp_data(type->mtype, type->prtype, descending,
+ static_cast<const byte*>(dfield_get_data(dfield1)),
+ dfield_get_len(dfield1),
+ static_cast<const byte*>(dfield_get_data(dfield2)),
+ dfield_get_len(dfield2));
+}
+
+#ifdef UNIV_DEBUG
+/** Compare a GIS data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec R-tree record
+@param[in] mode compare mode
+@retval negative if dtuple is less than rec */
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+ page_cur_mode_t mode)
+ MY_ATTRIBUTE((nonnull));
+#endif
+
+/** Compare two minimum bounding rectangles.
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+inline int cmp_geometry_field(const void *a, const void *b)
+{
+ const byte *mbr1= static_cast<const byte*>(a);
+ const byte *mbr2= static_cast<const byte*>(b);
+
+ static_assert(SPDIMS == 2, "compatibility");
+ static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+ /* Try to compare mbr left lower corner (xmin, ymin) */
+ double x1= mach_double_read(mbr1);
+ double x2= mach_double_read(mbr2);
+ if (x1 > x2)
+ return 1;
+ if (x1 < x2)
+ return -1;
+
+ x1= mach_double_read(mbr1 + sizeof(double) * SPDIMS);
+ x2= mach_double_read(mbr2 + sizeof(double) * SPDIMS);
+
+ if (x1 > x2)
+ return 1;
+ if (x1 < x2)
+ return -1;
+
+ /* left lower corner (xmin, ymin) overlaps, now right upper corner */
+ x1= mach_double_read(mbr1 + sizeof(double));
+ x2= mach_double_read(mbr2 + sizeof(double));
+
+ if (x1 > x2)
+ return 1;
+ if (x1 < x2)
+ return -1;
+
+ x1= mach_double_read(mbr1 + sizeof(double) * 2 + sizeof(double));
+ x2= mach_double_read(mbr2 + sizeof(double) * 2 + sizeof(double));
+
+ if (x1 > x2)
+ return 1;
+ if (x1 < x2)
+ return -1;
+
+ return 0;
+}
+
+/** Compare a data tuple to a physical record.
+@param dtuple data tuple
+@param rec B-tree index record
+@param index B-tree index
+@param offsets rec_get_offsets(rec,index)
+@param n_cmp number of fields to compare
+@param matched_fields number of completely matched fields
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int cmp_dtuple_rec_with_match_low(const dtuple_t *dtuple, const rec_t *rec,
+ const dict_index_t *index,
+ const rec_offs *offsets,
+ ulint n_cmp, ulint *matched_fields)
+ MY_ATTRIBUTE((nonnull));
+#define cmp_dtuple_rec_with_match(tuple,rec,index,offsets,fields) \
+ cmp_dtuple_rec_with_match_low( \
+ tuple,rec,index,offsets,dtuple_get_n_fields_cmp(tuple),fields)
+/** Compare a data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec B-tree or R-tree index record
+@param[in] index index tree
+@param[in] offsets rec_get_offsets(rec)
+@param[in,out] matched_fields number of completely matched fields
+@param[in,out] matched_bytes number of matched bytes in the first
+field that is not matched
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_bytes(
+ const dtuple_t* dtuple,
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ ulint* matched_fields,
+ ulint* matched_bytes)
+ MY_ATTRIBUTE((warn_unused_result));
+/** Compare a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@param dtuple data tuple
+@param rec index record
+@param index index
+@param offsets rec_get_offsets(rec, index)
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+inline int cmp_dtuple_rec(const dtuple_t *dtuple, const rec_t *rec,
+ const dict_index_t *index, const rec_offs *offsets)
+{
+ ulint matched= 0;
+ return cmp_dtuple_rec_with_match(dtuple, rec, index, offsets, &matched);
+}
+
+/** Check if a dtuple is a prefix of a record.
+@param dtuple data tuple
+@param rec index record
+@param index index
+@param offsets rec_get_offsets(rec)
+@return whether dtuple is a prefix of rec */
+bool cmp_dtuple_is_prefix_of_rec(const dtuple_t *dtuple, const rec_t *rec,
+ const dict_index_t *index,
+ const rec_offs *offsets)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval positive if rec1 (including non-ordering columns) is greater than rec2
+@retval negative if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const rec_offs* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const rec_offs* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ struct TABLE* table) /*!< in: MySQL table, for reporting
+ duplicate key value if applicable,
+ or NULL */
+ MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result));
+
+/** Compare two B-tree or R-tree records.
+Only the common first fields are compared, and externally stored field
+are treated as equal.
+@param[in] rec1 record (possibly not on an index page)
+@param[in] rec2 B-tree or R-tree record in an index page
+@param[in] offsets1 rec_get_offsets(rec1, index)
+@param[in] offsets2 rec_get_offsets(rec2, index)
+@param[in] nulls_unequal true if this is for index cardinality
+ statistics estimation with
+ innodb_stats_method=nulls_unequal
+ or innodb_stats_method=nulls_ignored
+@param[out] matched_fields number of completely matched fields
+ within the first field not completely matched
+@retval 0 if rec1 is equal to rec2
+@retval negative if rec1 is less than rec2
+@retval positive if rec1 is greater than rec2 */
+int
+cmp_rec_rec(
+ const rec_t* rec1,
+ const rec_t* rec2,
+ const rec_offs* offsets1,
+ const rec_offs* offsets2,
+ const dict_index_t* index,
+ bool nulls_unequal = false,
+ ulint* matched_fields = NULL)
+ MY_ATTRIBUTE((nonnull(1,2,3,4,5)));
+
+/** Compare two data fields.
+@param dfield1 data field
+@param dfield2 data field
+@return the comparison result of dfield1 and dfield2
+@retval true if dfield1 is equal to dfield2, or a prefix of dfield1
+@retval false otherwise */
+inline bool cmp_dfield_dfield_eq_prefix(const dfield_t *dfield1,
+ const dfield_t *dfield2)
+{
+ ut_ad(dfield_check_typed(dfield1));
+ ut_ad(dfield_check_typed(dfield2));
+ const dtype_t *type= dfield_get_type(dfield1);
+
+#ifdef UNIV_DEBUG
+ switch (type->prtype & DATA_MYSQL_TYPE_MASK) {
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ uint cs_num= dtype_get_charset_coll(type->prtype);
+ CHARSET_INFO *cs= get_charset(cs_num, MYF(MY_WME));
+ ut_a(cs);
+ return !cs->strnncoll(static_cast<const uchar*>(dfield_get_data(dfield1)),
+ dfield_get_len(dfield1),
+ static_cast<const uchar*>(dfield_get_data(dfield2)),
+ dfield_get_len(dfield2), 1);
+}
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
new file mode 100644
index 00000000..2f038ab3
--- /dev/null
+++ b/storage/innobase/include/rem0rec.h
@@ -0,0 +1,1276 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0dict.h"
+#include "trx0types.h"
+#endif /*! UNIV_INNOCHECKSUM */
+#include <ostream>
+#include <sstream>
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES 6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES 5
+
+#define REC_NEW_STATUS 3 /* This is single byte bit-field */
+#define REC_NEW_STATUS_MASK 0x7UL
+#define REC_NEW_STATUS_SHIFT 0
+
+/* The following four constants are needed in page0zip.cc in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO 4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define REC_HEAP_NO_SHIFT 3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE 4
+
+#ifndef UNIV_INNOCHECKSUM
+/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80;
+/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000;
+
+/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
+significant bit denotes that the tail of a field is stored off-page. */
+constexpr rec_offs REC_2BYTE_EXTERN_MASK= 0x4000;
+
+constexpr size_t RECORD_OFFSET= 2;
+constexpr size_t INDEX_OFFSET=
+ RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs);
+#endif /* UNIV_INNOCHECKSUM */
+
+/* Length of the rec_get_offsets() header */
+constexpr size_t REC_OFFS_HEADER_SIZE=
+#ifdef UNIV_DEBUG
+#ifndef UNIV_INNOCHECKSUM
+ sizeof(rec_t *) / sizeof(rec_offs) +
+ sizeof(dict_index_t *) / sizeof(rec_offs) +
+#endif /* UNIV_INNOCHECKSUM */
+#endif /* UNIV_DEBUG */
+ 2;
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+constexpr size_t REC_OFFS_NORMAL_SIZE= 300;
+constexpr size_t REC_OFFS_SMALL_SIZE= 18;
+constexpr size_t REC_OFFS_SEC_INDEX_SIZE=
+ /* PK max key parts */ 16 + /* sec idx max key parts */ 16 +
+ /* child page number for non-leaf pages */ 1;
+
+/** Get the base address of offsets. The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
+
+#ifndef UNIV_INNOCHECKSUM
+/* Offset consists of two parts: 2 upper bits is type and all other bits is
+value */
+
+/** Only 4 different values is possible! */
+enum field_type_t
+{
+ /** normal field */
+ STORED_IN_RECORD= 0 << 14,
+ /** this field is stored off-page */
+ STORED_OFFPAGE= 1 << 14,
+ /** just an SQL NULL */
+ SQL_NULL= 2 << 14,
+ /** instantly added field */
+ DEFAULT= 3 << 14,
+};
+
+/** without 2 upper bits */
+static constexpr rec_offs DATA_MASK= 0x3fff;
+/** 2 upper bits */
+static constexpr rec_offs TYPE_MASK= ~DATA_MASK;
+inline field_type_t get_type(rec_offs n)
+{
+ return static_cast<field_type_t>(n & TYPE_MASK);
+}
+inline void set_type(rec_offs &n, field_type_t type)
+{
+ n= static_cast<rec_offs>((n & DATA_MASK) | type);
+}
+inline rec_offs get_value(rec_offs n) { return n & DATA_MASK; }
+inline rec_offs combine(rec_offs value, field_type_t type)
+{
+ return static_cast<rec_offs>(get_value(value) | type);
+}
+
+/** Compact flag ORed to the extra size returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_COMPACT= rec_offs(~(rec_offs(~0) >> 1));
+/** External flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
+/** Default value flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
+constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
+
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Confirms the n_fields of the entry is sane with comparing the other
+record in the same page specified
+@param[in] index index
+@param[in] rec record of the same page
+@param[in] entry index entry
+@return true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+ dict_index_t* index,
+ const rec_t* rec,
+ const dtuple_t* entry)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine the status bits of a non-REDUNDANT record.
+@param[in] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@return status bits */
+inline
+rec_comp_status_t
+rec_get_status(const rec_t* rec)
+{
+ byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK;
+ ut_ad(bits <= REC_STATUS_INSTANT);
+ return static_cast<rec_comp_status_t>(bits);
+}
+
+/** Set the status bits of a non-REDUNDANT record.
+@param[in,out] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@param[in] bits status bits */
+inline void rec_set_status(rec_t *rec, byte bits)
+{
+ ut_ad(bits <= REC_STATUS_INSTANT);
+ rec[-REC_NEW_STATUS]= static_cast<byte>((rec[-REC_NEW_STATUS] &
+ ~REC_NEW_STATUS_MASK) | bits);
+}
+
+/** Get the length of added field count in a REC_STATUS_INSTANT record.
+@param[in] n_add_field number of added fields, minus one
+@return storage size of the field count, in bytes */
+inline unsigned rec_get_n_add_field_len(ulint n_add_field)
+{
+ ut_ad(n_add_field < REC_MAX_N_FIELDS);
+ return n_add_field < 0x80 ? 1 : 2;
+}
+
+/** Get the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record
+@return number of added fields */
+inline unsigned rec_get_n_add_field(const byte*& header)
+{
+ unsigned n_fields_add = *--header;
+ if (n_fields_add < 0x80) {
+ ut_ad(rec_get_n_add_field_len(n_fields_add) == 1);
+ return n_fields_add;
+ }
+
+ n_fields_add &= 0x7f;
+ n_fields_add |= unsigned(*--header) << 7;
+ ut_ad(n_fields_add < REC_MAX_N_FIELDS);
+ ut_ad(rec_get_n_add_field_len(n_fields_add) == 2);
+ return n_fields_add;
+}
+
+/** Set the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record
+@param[in] n_add number of added fields, minus 1
+@return record header before the number of added fields */
+inline void rec_set_n_add_field(byte*& header, ulint n_add)
+{
+ ut_ad(n_add < REC_MAX_N_FIELDS);
+
+ if (n_add < 0x80) {
+ *header-- = byte(n_add);
+ } else {
+ *header-- = byte(byte(n_add) | 0x80);
+ *header-- = byte(n_add >> 7);
+ }
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: compact physical record */
+ ulint bits) /*!< in: info bits */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1byte form */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag ORed */
+UNIV_INLINE
+uint8_t
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+uint16_t
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n) /*!< in: number of columns to scan */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine the offsets to each field in an index record.
+@param[in] rec physical record
+@param[in] index the index that the record belongs to
+@param[in,out] offsets array comprising offsets[0] allocated elements,
+ or an array from rec_get_offsets(), or NULL
+@param[in] n_core 0, or index->n_core_fields for leaf page
+@param[in] n_fields maximum number of offsets to compute
+ (ULINT_UNDEFINED to compute all offsets)
+@param[in,out] heap memory heap
+@return the new offsets */
+rec_offs*
+rec_get_offsets_func(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets,
+ ulint n_core,
+ ulint n_fields,
+#ifdef UNIV_DEBUG
+ const char* file, /*!< in: file name where called */
+ unsigned line, /*!< in: line number where called */
+#endif /* UNIV_DEBUG */
+ mem_heap_t** heap) /*!< in/out: memory heap */
+#ifdef UNIV_DEBUG
+ MY_ATTRIBUTE((nonnull(1,2,6,8),warn_unused_result));
+#else /* UNIV_DEBUG */
+ MY_ATTRIBUTE((nonnull(1,2,6),warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \
+ rec_get_offsets_func(rec,index,offsets,leaf,n,__FILE__,__LINE__,heap)
+#else /* UNIV_DEBUG */
+# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \
+ rec_get_offsets_func(rec, index, offsets, leaf, n, heap)
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ rec_offs* offsets)/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+ MY_ATTRIBUTE((nonnull));
+#ifdef UNIV_DEBUG
+/** Validate offsets returned by rec_get_offsets().
+@param[in] rec record, or NULL
+@param[in] index the index that the record belongs in, or NULL
+@param[in,out] offsets the offsets of the record
+@return true */
+bool
+rec_offs_validate(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets)
+ MY_ATTRIBUTE((nonnull(3), warn_unused_result));
+/** Update debug data in offsets, in order to tame rec_offs_validate().
+@param[in] rec record
+@param[in] index the index that the record belongs in
+@param[in] leaf whether the record resides in a leaf page
+@param[in,out] offsets offsets from rec_get_offsets() to adjust */
+void
+rec_offs_make_valid(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ rec_offs* offsets)
+ MY_ATTRIBUTE((nonnull));
+#else
+# define rec_offs_make_valid(rec, index, leaf, offsets)
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+ MY_ATTRIBUTE((nonnull));
+#define rec_get_nth_field_old(rec, n, len) \
+((rec) + rec_get_nth_field_offs_old(rec, n, len))
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+ MY_ATTRIBUTE((warn_unused_result));
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+ MY_ATTRIBUTE((nonnull));
+#define rec_get_nth_field(rec, offsets, n, len) \
+((rec) + rec_get_nth_field_offs(offsets, n, len))
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Mark the nth field as externally stored.
+@param[in] offsets array returned by rec_get_offsets()
+@param[in] n nth field */
+void
+rec_offs_make_nth_extern(
+ rec_offs* offsets,
+ const ulint n);
+
+MY_ATTRIBUTE((nonnull))
+/** Determine the number of allocated elements for an array of offsets.
+@param[in] offsets offsets after rec_offs_set_n_alloc()
+@return number of elements */
+inline ulint rec_offs_get_n_alloc(const rec_offs *offsets)
+{
+ ut_ad(offsets);
+ ulint n_alloc= offsets[0];
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ MEM_CHECK_ADDRESSABLE(offsets, n_alloc * sizeof *offsets);
+ return n_alloc;
+}
+
+/** Determine the number of fields for which offsets have been initialized.
+@param[in] offsets rec_get_offsets()
+@return number of fields */
+inline
+ulint
+rec_offs_n_fields(const rec_offs* offsets)
+{
+ ulint n_fields;
+ ut_ad(offsets);
+ n_fields = offsets[1];
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ return(n_fields);
+}
+
+/** Get a flag of a record field.
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+@param[in] flag flag to extract
+@return type of the record field */
+inline field_type_t rec_offs_nth_type(const rec_offs *offsets, ulint n)
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return get_type(rec_offs_base(offsets)[1 + n]);
+}
+
+/** Determine if a record field is missing
+(should be replaced by dict_index_t::instant_field_value()).
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+@return nonzero if default bit is set */
+inline ulint rec_offs_nth_default(const rec_offs *offsets, ulint n)
+{
+ return rec_offs_nth_type(offsets, n) == DEFAULT;
+}
+
+/** Determine if a record field is SQL NULL
+(should be replaced by dict_index_t::instant_field_value()).
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+@return nonzero if SQL NULL set */
+inline ulint rec_offs_nth_sql_null(const rec_offs *offsets, ulint n)
+{
+ return rec_offs_nth_type(offsets, n) == SQL_NULL;
+}
+
+/** Determine if a record field is stored off-page.
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+Returns nonzero if the extern bit is set in nth field of rec.
+@return nonzero if externally stored */
+inline ulint rec_offs_nth_extern(const rec_offs *offsets, ulint n)
+{
+ return rec_offs_nth_type(offsets, n) == STORED_OFFPAGE;
+}
+
+/** Get a global flag of a record.
+@param[in] offsets rec_get_offsets()
+@param[in] flag flag to extract
+@return the flag of the record field */
+inline ulint rec_offs_any_flag(const rec_offs *offsets, ulint flag)
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return *rec_offs_base(offsets) & flag;
+}
+
+/** Determine if the offsets are for a record containing off-page columns.
+@param[in] offsets rec_get_offsets()
+@return nonzero if any off-page columns exist */
+inline bool rec_offs_any_extern(const rec_offs *offsets)
+{
+ return rec_offs_any_flag(offsets, REC_OFFS_EXTERNAL);
+}
+
+/** Determine if the offsets are for a record that is missing fields.
+@param[in] offsets rec_get_offsets()
+@return nonzero if any fields need to be replaced with
+ dict_index_t::instant_field_value() */
+inline ulint rec_offs_any_default(const rec_offs *offsets)
+{
+ return rec_offs_any_flag(offsets, REC_OFFS_DEFAULT);
+}
+
+/** Determine if the offsets are for other than ROW_FORMAT=REDUNDANT.
+@param[in] offsets rec_get_offsets()
+@return nonzero if ROW_FORMAT is COMPACT,DYNAMIC or COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline ulint rec_offs_comp(const rec_offs *offsets)
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return (*rec_offs_base(offsets) & REC_OFFS_COMPACT);
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN or ALTER TABLE.
+@param[in] rec leaf page record
+@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_metadata(const rec_t* rec, ulint comp)
+{
+ bool is = !!(rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG);
+ ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT);
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN or ALTER TABLE.
+@param[in] rec leaf page record
+@param[in] index index of the record
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_metadata(const rec_t *rec, const dict_index_t &index)
+{
+ return rec_is_metadata(rec, index.table->not_redundant());
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN (not other ALTER TABLE).
+@param[in] rec leaf page record
+@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_add_metadata(const rec_t* rec, ulint comp)
+{
+ bool is = rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG;
+ ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT);
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN (not other ALTER TABLE).
+@param[in] rec leaf page record
+@param[in] index index of the record
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index)
+{
+ bool is = rec_is_add_metadata(rec, dict_table_is_comp(index.table));
+ ut_ad(!is || index.is_instant());
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ALTER TABLE (not plain ADD COLUMN).
+@param[in] rec leaf page record
+@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return whether the record is the ALTER TABLE metadata pseudo-record */
+inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp)
+{
+ bool is = !(~rec_get_info_bits(rec, comp)
+ & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG));
+ ut_ad(!is || rec_is_metadata(rec, comp));
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ALTER TABLE (not plain ADD COLUMN).
+@param[in] rec leaf page record
+@param[in] index index of the record
+@return whether the record is the ALTER TABLE metadata pseudo-record */
+inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index)
+{
+ bool is = rec_is_alter_metadata(rec, dict_table_is_comp(index.table));
+ ut_ad(!is || index.is_dummy || index.is_instant());
+ return is;
+}
+
+/** Determine if a record is delete-marked (not a metadata pseudo-record).
+@param[in] rec record
+@param[in] comp nonzero if ROW_FORMAT!=REDUNDANT
+@return whether the record is a delete-marked user record */
+inline bool rec_is_delete_marked(const rec_t* rec, ulint comp)
+{
+ return (rec_get_info_bits(rec, comp)
+ & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))
+ == REC_INFO_DELETED_FLAG;
+}
+
+/** Get the nth field from an index.
+@param[in] rec index record
+@param[in] index index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] n field number
+@param[out] len length of the field in bytes, or UNIV_SQL_NULL
+@return a read-only copy of the index field */
+inline
+const byte*
+rec_get_nth_cfield(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ ulint n,
+ ulint* len)
+{
+ /* Because this function may be invoked by innobase_rec_to_mysql()
+ for reporting a duplicate key during ALTER TABLE or
+ CREATE UNIQUE INDEX, and in that case the rec omit the fixed-size
+ header of 5 or 6 bytes, the check
+ rec_offs_validate(rec, index, offsets) must be avoided here. */
+ if (!rec_offs_nth_default(offsets, n)) {
+ return rec_get_nth_field(rec, offsets, n, len);
+ }
+ return index->instant_field_value(n, len);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ rec_offs*offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+ MY_ATTRIBUTE((nonnull));
+#define rec_offs_init(offsets) \
+ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets)
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of record minus data size of record.
+The value returned by the function is the distance from record
+start to record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+#else /* UNIV_DEBUG */
+# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets))
+# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets))
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in] buf buffer
+@param[in] rec physical record
+@param[in] offsets array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+ void* buf,
+ const rec_t* rec,
+ const rec_offs* offsets);
+
+/** Determine the size of a data tuple prefix in a temporary file.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[in] index clustered or secondary index
+@param[in] fields data fields
+@param[in] n_fields number of data fields
+@param[out] extra record header size
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT
+@return total size, in bytes */
+template<bool redundant_temp>
+ulint
+rec_get_converted_size_temp(
+ const dict_index_t* index,
+ const dfield_t* fields,
+ ulint n_fields,
+ ulint* extra,
+ rec_comp_status_t status = REC_STATUS_ORDINARY)
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Determine the offset to each field in temporary file.
+@param[in] rec temporary file record
+@param[in] index index of that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+@param[in] n_core number of core fields (index->n_core_fields)
+@param[in] def_val default values for non-core fields
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+void
+rec_init_offsets_temp(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets,
+ ulint n_core,
+ const dict_col_t::def_t*def_val,
+ rec_comp_status_t status = REC_STATUS_ORDINARY)
+ MY_ATTRIBUTE((nonnull(1,2,3)));
+/** Determine the offset to each field in temporary file.
+@param[in] rec temporary file record
+@param[in] index index of that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+*/
+void
+rec_init_offsets_temp(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets)
+ MY_ATTRIBUTE((nonnull));
+
+/** Convert a data tuple prefix to the temporary file format.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[out] rec record in temporary file format
+@param[in] index clustered or secondary index
+@param[in] fields data fields
+@param[in] n_fields number of data fields
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+template<bool redundant_temp>
+void
+rec_convert_dtuple_to_temp(
+ rec_t* rec,
+ const dict_index_t* index,
+ const dfield_t* fields,
+ ulint n_fields,
+ rec_comp_status_t status = REC_STATUS_ORDINARY)
+ MY_ATTRIBUTE((nonnull));
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of
+ externally stored columns */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ MY_ATTRIBUTE((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1,2)));
+
+/** Determine the size of a record in ROW_FORMAT=COMPACT.
+@param[in] index record descriptor. dict_table_is_comp()
+ is assumed to hold, even if it doesn't
+@param[in] tuple logical record
+@param[out] extra extra size
+@return total size */
+ulint
+rec_get_converted_size_comp(
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ ulint* extra)
+ MY_ATTRIBUTE((nonnull(1,2)));
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Copy the first n fields of a (copy of a) physical record to a data tuple.
+The fields are copied into the memory heap.
+@param[out] tuple data tuple
+@param[in] rec index record, or a copy thereof
+@param[in] index index of rec
+@param[in] n_core index->n_core_fields at the time rec was
+ copied, or 0 if non-leaf page record
+@param[in] n_fields number of fields to copy
+@param[in,out] heap memory heap */
+void
+rec_copy_prefix_to_dtuple(
+ dtuple_t* tuple,
+ const rec_t* rec,
+ const dict_index_t* index,
+ ulint n_core,
+ ulint n_fields,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints an old-style physical record. */
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a spatial index record. */
+void
+rec_print_mbr_rec(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+ MY_ATTRIBUTE((nonnull));
+
+/** Pretty-print a record.
+@param[in,out] o output stream
+@param[in] rec physical record
+@param[in] info rec_get_info_bits(rec)
+@param[in] offsets rec_get_offsets(rec) */
+void
+rec_print(
+ std::ostream& o,
+ const rec_t* rec,
+ ulint info,
+ const rec_offs* offsets);
+
+/** Wrapper for pretty-printing a record */
+struct rec_index_print
+{
+ /** Constructor */
+ rec_index_print(const rec_t* rec, const dict_index_t* index) :
+ m_rec(rec), m_index(index)
+ {}
+
+ /** Record */
+ const rec_t* m_rec;
+ /** Index */
+ const dict_index_t* m_index;
+};
+
+/** Display a record.
+@param[in,out] o output stream
+@param[in] r record to display
+@return the output stream */
+std::ostream&
+operator<<(std::ostream& o, const rec_index_print& r);
+
+/** Wrapper for pretty-printing a record */
+struct rec_offsets_print
+{
+ /** Constructor */
+ rec_offsets_print(const rec_t* rec, const rec_offs* offsets) :
+ m_rec(rec), m_offsets(offsets)
+ {}
+
+ /** Record */
+ const rec_t* m_rec;
+ /** Offsets to each field */
+ const rec_offs* m_offsets;
+};
+
+/** Display a record.
+@param[in,out] o output stream
+@param[in] r record to display
+@return the output stream */
+ATTRIBUTE_COLD
+std::ostream&
+operator<<(std::ostream& o, const rec_offsets_print& r);
+
+/** Pretty-printer of records and tuples */
+class rec_printer : public std::ostringstream {
+public:
+ /** Construct a pretty-printed record.
+ @param rec record with header
+ @param offsets rec_get_offsets(rec, ...) */
+ ATTRIBUTE_COLD
+ rec_printer(const rec_t* rec, const rec_offs* offsets)
+ :
+ std::ostringstream ()
+ {
+ rec_print(*this, rec,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)),
+ offsets);
+ }
+
+ /** Construct a pretty-printed record.
+ @param rec record, possibly lacking header
+ @param info rec_get_info_bits(rec)
+ @param offsets rec_get_offsets(rec, ...) */
+ ATTRIBUTE_COLD
+ rec_printer(const rec_t* rec, ulint info, const rec_offs* offsets)
+ :
+ std::ostringstream ()
+ {
+ rec_print(*this, rec, info, offsets);
+ }
+
+ /** Construct a pretty-printed tuple.
+ @param tuple data tuple */
+ ATTRIBUTE_COLD
+ rec_printer(const dtuple_t* tuple)
+ :
+ std::ostringstream ()
+ {
+ dtuple_print(*this, tuple);
+ }
+
+ /** Construct a pretty-printed tuple.
+ @param field array of data tuple fields
+ @param n number of fields */
+ ATTRIBUTE_COLD
+ rec_printer(const dfield_t* field, ulint n)
+ :
+ std::ostringstream ()
+ {
+ dfield_print(*this, field, n);
+ }
+
+ /** Destructor */
+ ~rec_printer() override = default;
+
+private:
+ /** Copy constructor */
+ rec_printer(const rec_printer& other);
+ /** Assignment operator */
+ rec_printer& operator=(const rec_printer& other);
+};
+
+
+# ifdef UNIV_DEBUG
+/** Read the DB_TRX_ID of a clustered index record.
+@param[in] rec clustered index record
+@param[in] index clustered index
+@return the value of DB_TRX_ID */
+trx_id_t
+rec_get_trx_id(
+ const rec_t* rec,
+ const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT 0x7FUL
+#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL
+
+/* The data size of record must not be larger than this on
+REDUNDANT row format because we reserve two upmost bits in a
+two byte offset for special purposes */
+#define REDUNDANT_REC_MAX_DATA_SIZE (16383)
+
+/* The data size of record must be smaller than this on
+COMPRESSED row format because we reserve two upmost bits in a
+two byte offset for special purposes */
+#define COMPRESSED_REC_MAX_DATA_SIZE (16384)
+
+#ifdef WITH_WSREP
+int wsrep_rec_get_foreign_key(
+ byte *buf, /* out: extracted key */
+ ulint *buf_len, /* in/out: length of buf */
+ const rec_t* rec, /* in: physical record */
+ dict_index_t* index_for, /* in: index for foreign table */
+ dict_index_t* index_ref, /* in: index for referenced table */
+ ibool new_protocol); /* in: protocol > 1 */
+#endif /* WITH_WSREP */
+
+#include "rem0rec.inl"
+
+#endif /* !UNIV_INNOCHECKSUM */
+#endif /* rem0rec_h */
diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl
new file mode 100644
index 00000000..46c209cb
--- /dev/null
+++ b/storage/innobase/include/rem0rec.inl
@@ -0,0 +1,1134 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.ic
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0boot.h"
+#include "btr0types.h"
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits pointer to next record
+ 2 8 bits pointer to next record
+ 3 1 bit short flag
+ 7 bits number of fields
+ 4 3 bits number of fields
+ 5 bits heap number
+ 5 8 bits heap number
+ 6 4 bits n_owned
+ 4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits relative offset of next record
+ 2 8 bits relative offset of next record
+ the relative offset is an unsigned 16-bit
+ integer:
+ (offset_of_next_record
+ - offset_of_this_record) mod 64Ki,
+ where mod is the modulo as a non-negative
+ number;
+ we can calculate the offset of the next
+ record with the formula:
+ relative_offset + offset_of_this_record
+ mod srv_page_size
+ 3 3 bits status:
+ 000=REC_STATUS_ORDINARY
+ 001=REC_STATUS_NODE_PTR
+ 010=REC_STATUS_INFIMUM
+ 011=REC_STATUS_SUPREMUM
+ 100=REC_STATUS_INSTANT
+ 1xx=reserved
+ 5 bits heap number
+ 4 8 bits heap number
+ 5 4 bits n_owned
+ 4 bits info bits
+*/
+
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record. */
+
+#define REC_NEXT 2
+#define REC_NEXT_MASK 0xFFFFUL
+#define REC_NEXT_SHIFT 0
+
+#define REC_OLD_SHORT 3 /* This is single byte bit-field */
+#define REC_OLD_SHORT_MASK 0x1UL
+#define REC_OLD_SHORT_SHIFT 0
+
+#define REC_OLD_N_FIELDS 4
+#define REC_OLD_N_FIELDS_MASK 0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT 1
+
+#define REC_OLD_HEAP_NO 5
+#define REC_HEAP_NO_MASK 0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.cc */
+#define REC_NEW_HEAP_NO 4
+#define REC_HEAP_NO_SHIFT 3
+#endif
+
+#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */
+#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */
+#define REC_N_OWNED_MASK 0xFUL
+#define REC_N_OWNED_SHIFT 0
+
+#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */
+#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */
+#define REC_INFO_BITS_MASK 0xF0UL
+#define REC_INFO_BITS_SHIFT 0
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+ ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+ ^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+ ^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
+
+/******************************************************//**
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+byte
+rec_get_bit_field_1(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ return static_cast<byte>((*(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask);
+ ut_ad(mask <= 0xFFUL);
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_1(rec - offs,
+ (mach_read_from_1(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask > 0xFFUL);
+ ut_ad(mask <= 0xFFFFUL);
+ ut_ad((mask >> shift) & 1);
+ ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_2(rec - offs,
+ (mach_read_from_2(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+The following function is used to get the offset of the next chained record
+on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+ compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
+ compile_time_assert(REC_NEXT_SHIFT == 0);
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as signed 16-bit integer in 2's complement arithmetics.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written simpler as
+ (int16_t) field_value + ut_align_offset(...) < srv_page_size
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, srv_page_size)
+ < srv_page_size);
+#endif
+ if (field_value == 0) {
+
+ return(0);
+ }
+
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ between each record. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return(ut_align_offset(rec + field_value, srv_page_size));
+ } else {
+ ut_ad(field_value < srv_page_size);
+
+ return(field_value);
+ }
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ut_ad(srv_page_size > next);
+ compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
+ compile_time_assert(REC_NEXT_SHIFT == 0);
+ mach_write_to_2(rec - REC_NEXT, next);
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ulint field_value;
+
+ ut_ad(srv_page_size > next);
+
+ if (!next) {
+ field_value = 0;
+ } else {
+ /* The following two statements calculate
+ next - offset_of_rec mod 64Ki, where mod is the modulo
+ as a non-negative number */
+
+ field_value = (ulint)
+ ((lint) next
+ - (lint) ut_align_offset(rec, srv_page_size));
+ field_value &= REC_NEXT_MASK;
+ }
+
+ mach_write_to_2(rec - REC_NEXT, field_value);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK,
+ REC_OLD_N_FIELDS_SHIFT);
+ ut_ad(ret <= REC_MAX_N_FIELDS);
+ ut_ad(ret > 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to set the number of fields
+in an old-style record. */
+UNIV_INLINE
+void
+rec_set_n_fields_old(
+/*=================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint n_fields) /*!< in: the number of fields */
+{
+ ut_ad(rec);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields > 0);
+
+ rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ ut_ad(rec);
+ ut_ad(index);
+
+ if (!dict_table_is_comp(index->table)) {
+ return(rec_get_n_fields_old(rec));
+ }
+
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ return(dict_index_get_n_fields(index));
+ case REC_STATUS_NODE_PTR:
+ return(dict_index_get_n_unique_in_tree(index) + 1);
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ return(1);
+ }
+
+ ut_error;
+ return(ULINT_UNDEFINED);
+}
+
+/** Confirms the n_fields of the entry is sane with comparing the other
+record in the same page specified
+@param[in] index index
+@param[in] rec record of the same page
+@param[in] entry index entry
+@return true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+ dict_index_t* index,
+ const rec_t* rec,
+ const dtuple_t* entry)
+{
+ const ulint n_fields = rec_get_n_fields(rec, index);
+
+ return(n_fields == dtuple_get_n_fields(entry)
+ || (index->is_instant()
+ && n_fields >= index->n_core_fields)
+ /* a record for older SYS_INDEXES table
+ (missing merge_threshold column) is acceptable. */
+ || (index->table->id == DICT_INDEXES_ID
+ && n_fields == dtuple_get_n_fields(entry) - 1));
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to retrieve the info bits of a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return rec_get_bit_field_1(
+ rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+ & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+ if (comp)
+ return static_cast<byte>(rec_get_info_bits(rec, TRUE) |
+ rec_get_status(rec));
+ else
+ return rec_get_info_bits(rec, FALSE);
+}
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info bits */
+{
+ compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+ & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+ rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+ rec_set_bit_field_1(rec, bits & ~REC_NEW_STATUS_MASK,
+ REC_NEW_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ if (comp) {
+ return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT));
+ } else {
+ return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT));
+ }
+}
+
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to test whether the data offsets in the record
+are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1byte form */
+{
+ ut_ad(flag <= 1);
+
+ rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT);
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag ORed */
+UNIV_INLINE
+uint8_t
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1)));
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+uint16_t
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2)));
+}
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK);
+}
+
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ rec_offs*offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+{
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ MEM_UNDEFINED(offsets, n_alloc * sizeof *offsets);
+ offsets[0] = static_cast<rec_offs>(n_alloc);
+}
+
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null; UNIV_SQL_DEFAULT is default value */
+{
+ ut_ad(n < rec_offs_n_fields(offsets));
+
+ rec_offs offs = n == 0 ? 0 : get_value(rec_offs_base(offsets)[n]);
+ rec_offs next_offs = rec_offs_base(offsets)[1 + n];
+
+ if (get_type(next_offs) == SQL_NULL) {
+ *len = UNIV_SQL_NULL;
+ } else if (get_type(next_offs) == DEFAULT) {
+ *len = UNIV_SQL_DEFAULT;
+ } else {
+ *len = get_value(next_offs) - offs;
+ }
+
+ return(offs);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */
+{
+ ulint i;
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(NULL);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field
+ = rec_get_nth_field(rec, offsets, i, &len);
+
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ if (!memcmp(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ return(field);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ if (!n) {
+ return get_value(rec_offs_base(offsets)[1 + n]);
+ }
+ return get_value((rec_offs_base(offsets)[1 + n]))
+ - get_value(rec_offs_base(offsets)[n]);
+}
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n = 0;
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ n++;
+ }
+ }
+ }
+
+ return(n);
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value. This function and the 2-byte counterpart are defined here because the
+C-compiler was not able to sum negative and positive constant offsets, and
+warned of constant arithmetic overflow within the compiler.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n)));
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n)));
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+1-byte format. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info);
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+2-byte format. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 1-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_1_get_prev_field_end_info(rec, n)
+ & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 2-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_2_get_prev_field_end_info(rec, n)
+ & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
+}
+
+/******************************************************//**
+The following function is used to read the offset of the start of a data field
+in the record. The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec);
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(rec_1_get_field_start_offs(rec, n));
+ }
+
+ return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+{
+ ulint os;
+ ulint next_os;
+
+ os = rec_get_field_start_offs(rec, n);
+ next_os = rec_get_field_start_offs(rec, n + 1);
+
+ ut_ad(next_os - os < srv_page_size);
+
+ return(next_os - os);
+}
+
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ut_ad(rec);
+
+ return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+The following function sets the number of fields in offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+ rec_offs* offsets, /*!< in/out: array returned by
+ rec_get_offsets() */
+ ulint n_fields) /*!< in: number of fields */
+{
+ ut_ad(offsets);
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ offsets[1] = static_cast<rec_offs>(n_fields);
+}
+
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = get_value(rec_offs_base(offsets)[rec_offs_n_fields(offsets)]);
+ ut_ad(size < srv_page_size);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = *rec_offs_base(offsets) & REC_OFFS_MASK;
+ ut_ad(size < srv_page_size);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets)));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets)));
+}
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in] buf buffer
+@param[in] rec physical record
+@param[in] offsets array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+ void* buf,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ ulint extra_len;
+ ulint data_len;
+
+ ut_ad(rec != NULL);
+ ut_ad(buf != NULL);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+
+ extra_len = rec_offs_extra_size(offsets);
+ data_len = rec_offs_data_size(offsets);
+
+ memcpy(buf, rec - extra_len, extra_len + data_len);
+
+ return((byte*) buf + extra_len);
+}
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ return(REC_N_OLD_EXTRA_BYTES + n_fields);
+ }
+
+ return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ ulint data_size;
+ ulint extra_size;
+
+ ut_ad(dtuple_check_typed(dtuple));
+#ifdef UNIV_DEBUG
+ if (dict_index_is_ibuf(index)) {
+ ut_ad(dtuple->n_fields > 1);
+ } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+ == REC_STATUS_NODE_PTR) {
+ ut_ad(dtuple->n_fields - 1
+ == dict_index_get_n_unique_in_tree_nonleaf(index));
+ } else if (index->table->id == DICT_INDEXES_ID) {
+ /* The column SYS_INDEXES.MERGE_THRESHOLD was
+ instantly added in MariaDB 10.2.2 (MySQL 5.7). */
+ ut_ad(!index->table->is_temporary());
+ ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES);
+ ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES
+ || dtuple->n_fields
+ == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD);
+ } else {
+ ut_ad(dtuple->n_fields >= index->n_core_fields);
+ ut_ad(dtuple->n_fields <= index->n_fields
+ || dtuple->is_alter_metadata());
+ }
+#endif
+
+ if (dict_table_is_comp(index->table)) {
+ return rec_get_converted_size_comp(index, dtuple, NULL);
+ }
+
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ /* If primary key is being updated then the new record inherits
+ externally stored fields from the delete-marked old record.
+ In that case, n_ext may be less value than
+ dtuple_get_n_ext(tuple). */
+ ut_ad(n_ext <= dtuple_get_n_ext(dtuple));
+ extra_size = rec_get_converted_extra_size(
+ data_size, dtuple_get_n_fields(dtuple), n_ext);
+
+ return(data_size + extra_size);
+}
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h
new file mode 100644
index 00000000..0e4075a9
--- /dev/null
+++ b/storage/innobase/include/rem0types.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0types.h
+Record manager global types
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* We define the physical record simply as an array of bytes */
+typedef byte rec_t;
+
+/** This type represents a field offset in a rec_t* */
+typedef unsigned short int rec_offs;
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS (1024 - 1)
+#define REC_MAX_HEAP_NO (2 * 8192 - 1)
+#define REC_MAX_N_OWNED (16 - 1)
+
+/* Maximum number of user defined fields/columns. The reserved columns
+are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR.
+Before MariaDB Server 10.5, we needed "* 2" because mlog_parse_index()
+created a dummy table object possibly, with some of the system columns
+in it, and then adds the 3 system columns (again) using
+dict_table_add_system_columns().
+For now, we will keep this limitation to maintain file format compatibility
+with older versions. */
+#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2)
+
+/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed field length (or indexed prefix length) for indexes on tables of
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format.
+Before we support UTF-8 encodings with mbmaxlen = 4, a UTF-8 character
+may take at most 3 bytes. So the limit was set to 3*256, so that one
+can create a column prefix index on 256 characters of a TEXT or VARCHAR
+column also in the UTF-8 charset.
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768
+
+/** Maximum indexed field length for tables that have atomic BLOBs.
+This (3072) is the maximum index row length allowed, so we cannot create index
+prefix column longer than that. */
+#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072
+
+/** Innodb row types are a subset of the MySQL global enum row_type.
+They are made into their own enum so that switch statements can account
+for each of them. */
+enum rec_format_enum {
+ REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */
+ REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */
+ REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */
+ REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */
+};
+typedef enum rec_format_enum rec_format_t;
+
+#endif
diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h
new file mode 100644
index 00000000..78886332
--- /dev/null
+++ b/storage/innobase/include/row0ext.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.h
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#ifndef row0ext_h
+#define row0ext_h
+
+#include "data0types.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "fsp0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dict_table_t& table, /*!< in: table */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ mem_heap_t* heap); /*!< in: heap where created */
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most the length determined by
+ DICT_MAX_FIELD_LEN_BY_FORMAT() */
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most the length determined by
+ DICT_MAX_FIELD_LEN_BY_FORMAT() */
+
+/** Prefixes of externally stored columns */
+struct row_ext_t{
+ ulint n_ext; /*!< number of externally stored columns */
+ const ulint* ext; /*!< col_no's of externally stored columns */
+ byte* buf; /*!< backing store of the column prefix cache */
+ ulint max_len;/*!< maximum prefix length, it could be
+ REC_ANTELOPE_MAX_INDEX_COL_LEN or
+ REC_VERSION_56_MAX_INDEX_COL_LEN depending
+ on row format */
+ ulint zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */
+ ulint len[1]; /*!< prefix lengths; 0 if not cached */
+};
+
+#include "row0ext.inl"
+
+#endif
diff --git a/storage/innobase/include/row0ext.inl b/storage/innobase/include/row0ext.inl
new file mode 100644
index 00000000..913b51b3
--- /dev/null
+++ b/storage/innobase/include/row0ext.inl
@@ -0,0 +1,87 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.ic
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "rem0types.h"
+#include "btr0types.h"
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most ext->max_len */
+{
+ ut_ad(ext);
+ ut_ad(len);
+ ut_ad(i < ext->n_ext);
+
+ *len = ext->len[i];
+
+ ut_ad(*len <= ext->max_len);
+ ut_ad(ext->max_len > 0);
+
+ if (*len == 0) {
+ /* The BLOB could not be fetched to the cache. */
+ return(field_ref_zero);
+ } else {
+ return(ext->buf + i * ext->max_len);
+ }
+}
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most ext->max_len */
+{
+ ulint i;
+
+ ut_ad(ext);
+ ut_ad(len);
+
+ for (i = 0; i < ext->n_ext; i++) {
+ if (col == ext->ext[i]) {
+ return(row_ext_lookup_ith(ext, i, len));
+ }
+ }
+
+ return(NULL);
+}
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
new file mode 100644
index 00000000..3ffa8243
--- /dev/null
+++ b/storage/innobase/include/row0ftsort.h
@@ -0,0 +1,268 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ftsort.h
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#ifndef row0ftsort_h
+#define row0ftsort_h
+
+#include "data0data.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "rem0types.h"
+#include "row0merge.h"
+#include "btr0bulk.h"
+#include "srv0srv.h"
+
+/** This structure defineds information the scan thread will fetch
+and put to the linked list for parallel tokenization/sort threads
+to process */
+typedef struct fts_doc_item fts_doc_item_t;
+
+/** Information about temporary files used in merge sort */
+struct fts_doc_item {
+ dfield_t* field; /*!< field contains document string */
+ doc_id_t doc_id; /*!< document ID */
+ UT_LIST_NODE_T(fts_doc_item_t) doc_list;
+ /*!< list of doc items */
+};
+
+/** This defines the list type that scan thread would feed the parallel
+tokenization threads and sort threads. */
+typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t;
+
+#define FTS_PLL_MERGE 1
+
+/** Sort information passed to each individual parallel sort thread */
+struct fts_psort_t;
+
+/** Common info passed to each parallel sort thread */
+struct fts_psort_common_t {
+ row_merge_dup_t* dup; /*!< descriptor of FTS index */
+ dict_table_t* new_table; /*!< source table */
+ /** Old table page size */
+ ulint old_zip_size;
+ trx_t* trx; /*!< transaction */
+ fts_psort_t* all_info; /*!< all parallel sort info */
+ pthread_cond_t sort_cond; /*!< sort completion */
+ ibool opt_doc_id_size;/*!< whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort, if
+ Doc ID will not be big enough
+ to use 8 bytes value */
+};
+
+struct fts_psort_t {
+ ulint psort_id; /*!< Parallel sort ID */
+ row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX];
+ /*!< sort buffer */
+ merge_file_t* merge_file[FTS_NUM_AUX_INDEX];
+ /*!< sort file */
+ row_merge_block_t* merge_block[FTS_NUM_AUX_INDEX];
+ /*!< buffer to write to file */
+ row_merge_block_t* crypt_block[FTS_NUM_AUX_INDEX];
+ /*!< buffer to crypt data */
+ ulint child_status; /*!< child task status */
+ ulint state; /*!< parent state */
+ fts_doc_list_t fts_doc_list; /*!< doc list to process */
+ fts_psort_common_t* psort_common; /*!< ptr to all psort info */
+ tpool::waitable_task* task; /*!< threadpool task */
+ dberr_t error; /*!< db error during psort */
+ ulint memory_used; /*!< memory used by fts_doc_list */
+ mysql_mutex_t mutex; /*!< mutex for fts_doc_list */
+};
+
+/** Row fts token for plugin parser */
+struct row_fts_token_t {
+ fts_string_t* text; /*!< token */
+ UT_LIST_NODE_T(row_fts_token_t)
+ token_list; /*!< next token link */
+};
+
+typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t;
+
+/** Structure stores information from string tokenization operation */
+struct fts_tokenize_ctx {
+ /** the processed string length in bytes
+ (when using the built-in tokenizer),
+ or the number of row_merge_fts_doc_tokenize_by_parser() calls */
+ ulint processed_len;
+ ulint init_pos; /*!< doc start position */
+ ulint buf_used; /*!< the sort buffer (ID) when
+ tokenization stops, which
+ could due to sort buffer full */
+ ulint rows_added[FTS_NUM_AUX_INDEX];
+ /*!< number of rows added for
+ each FTS index partition */
+ ib_rbt_t* cached_stopword;/*!< in: stopword list */
+ dfield_t sort_field[FTS_NUM_FIELDS_SORT];
+ /*!< in: sort field */
+ /** parsed tokens (when using an external parser) */
+ fts_token_list_t fts_token_list;
+
+ fts_tokenize_ctx() :
+ processed_len(0), init_pos(0), buf_used(0),
+ rows_added(), cached_stopword(NULL), sort_field(),
+ fts_token_list()
+ {
+ memset(rows_added, 0, sizeof rows_added);
+ memset(sort_field, 0, sizeof sort_field);
+ UT_LIST_INIT(fts_token_list, &row_fts_token_t::token_list);
+ }
+};
+
+typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
+
+/** Structure stores information needed for the insertion phase of FTS
+parallel sort. */
+struct fts_psort_insert {
+ CHARSET_INFO* charset; /*!< charset info */
+ mem_heap_t* heap; /*!< heap */
+ ibool opt_doc_id_size;/*!< Whether to use smaller (4 bytes)
+ integer for Doc ID */
+ BtrBulk* btr_bulk; /*!< Bulk load instance */
+ dtuple_t* tuple; /*!< Tuple to insert */
+
+#ifdef UNIV_DEBUG
+ ulint aux_index_id; /*!< Auxiliary index id */
+#endif
+};
+
+typedef struct fts_psort_insert fts_psort_insert_t;
+
+
+/** status bit used for communication between parent and child thread */
+#define FTS_PARENT_COMPLETE 1
+#define FTS_PARENT_EXITING 2
+#define FTS_CHILD_COMPLETE 1
+
+/** Print some debug information */
+#define FTSORT_PRINT
+
+#ifdef FTSORT_PRINT
+#define DEBUG_FTS_SORT_PRINT(str) \
+ do { \
+ ut_print_timestamp(stderr); \
+ fprintf(stderr, str); \
+ } while (0)
+#else
+#define DEBUG_FTS_SORT_PRINT(str)
+#endif /* FTSORT_PRINT */
+
+/*************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID
+3) Word's position in original 'doc'.
+
+@return dict_index_t structure for the fts sort index */
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+ dict_index_t* index, /*!< in: Original FTS index
+ based on which this sort index
+ is created */
+ dict_table_t* table, /*!< in,out: table that FTS index
+ is being created on */
+ ibool* opt_doc_id_size);
+ /*!< out: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort */
+
+/** Initialize FTS parallel sort structures.
+@param[in] trx transaction
+@param[in,out] dup descriptor of FTS index being created
+@param[in] new_table table where indexes are created
+@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes
+ integer to store Doc ID during sort
+@param[in] old_zip_size page size of the old table during alter
+@param[out] psort parallel sort info to be instantiated
+@param[out] merge parallel merge info to be instantiated
+@return true if all successful */
+bool
+row_fts_psort_info_init(
+ trx_t* trx,
+ row_merge_dup_t*dup,
+ dict_table_t* new_table,
+ bool opt_doc_id_size,
+ ulint old_zip_size,
+ fts_psort_t** psort,
+ fts_psort_t** merge)
+ MY_ATTRIBUTE((nonnull));
+
+/********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close
+temparary merge sort files */
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ fts_psort_t* merge_info); /*!< parallel merge info */
+/********************************************************************//**
+Free up merge buffers when merge sort is done */
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+ fts_psort_t* psort_info); /*!< in: parallel sort info */
+
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+void
+row_fts_start_psort(
+/*================*/
+ fts_psort_t* psort_info); /*!< in: parallel sort info */
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+ fts_psort_t* merge_info); /*!< in: parallel sort info */
+/********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent where this value propagated to */
+int
+row_merge_fts_sel_propagate(
+/*========================*/
+ int propogated, /*<! in: tree node propagated */
+ int* sel_tree, /*<! in: selection tree */
+ ulint level, /*<! in: selection tree level */
+ const mrec_t** mrec, /*<! in: sort record */
+ rec_offs** offsets, /*<! in: record offsets */
+ dict_index_t* index); /*<! in: FTS index */
+/********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return DB_SUCCESS or error number */
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ ulint id) /* !< in: which auxiliary table's data
+ to insert to */
+ MY_ATTRIBUTE((nonnull));
+#endif /* row0ftsort_h */
diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h
new file mode 100644
index 00000000..fd2651da
--- /dev/null
+++ b/storage/innobase/include/row0import.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0import.h
+Header file for import tablespace functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0import_h
+#define row0import_h
+
+#include "dict0types.h"
+
+// Forward declarations
+struct trx_t;
+struct dict_table_t;
+struct row_prebuilt_t;
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+dberr_t
+row_import_for_mysql(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct
+ in MySQL */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN.
+@param[in,out] trx dictionary transaction
+@param[in] table_id table identifier
+@param[in] discarded whether to set or clear the flag
+@return DB_SUCCESS or error code */
+dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id,
+ bool discarded)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update the root page numbers and tablespace ID of a table.
+@param[in,out] trx dictionary transaction
+@param[in,out] table persistent table
+@param[in] reset whether to reset the fields to FIL_NULL
+@return DB_SUCCESS or error code */
+dberr_t
+row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#endif /* row0import_h */
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
new file mode 100644
index 00000000..ac2479c4
--- /dev/null
+++ b/storage/innobase/include/row0ins.h
@@ -0,0 +1,224 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.h
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "data0data.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include <vector>
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_foreign_key_check_lock.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
+DB_ROW_IS_REFERENCED */
+dberr_t
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE If we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row); /*!< in: new row (or first row) for the node */
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete marked record is then
+updated to an existing record, and we must write an undo log record on
+the delete marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint n_uniq, /*!< in: 0 or index->n_uniq */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread or NULL */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_INSERT_TREE is needed
+@return error code */
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: secondary index */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during
+ row_log_table_apply(), or 0 */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ MY_ATTRIBUTE((warn_unused_result));
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+ dict_index_t* index, /*!< in: secondary index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ bool check_foreign = true) /*!< in: true if check
+ foreign table is needed, false otherwise */
+ MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* Insert node types */
+#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */
+#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */
+#define INS_DIRECT 2 /* this is for internal use in dict0crea:
+ insert the row directly */
+
+/* Node execution states */
+#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */
+#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and
+ inserted */
+
+struct row_prebuilt_t;
+
+/** Insert node structure */
+struct ins_node_t
+{
+ explicit ins_node_t(ulint ins_type, dict_table_t *table) :
+ common(QUE_NODE_INSERT, NULL),
+ ins_type(ins_type),
+ row(NULL), table(table), select(NULL), values_list(NULL),
+ state(INS_NODE_SET_IX_LOCK), index(NULL),
+ entry_list(), entry(entry_list.end()),
+ trx_id(0), entry_sys_heap(mem_heap_create(128))
+ {
+ }
+ ~ins_node_t() { mem_heap_free(entry_sys_heap); }
+ que_common_t common; /*!< node type: QUE_NODE_INSERT */
+ ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+ dtuple_t* row; /*!< row to insert */
+ dict_table_t* table; /*!< table where to insert */
+ sel_node_t* select; /*!< select in searched insert */
+ que_node_t* values_list;/* list of expressions to evaluate and
+ insert in an INS_VALUES insert */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index where the index
+ entry should be inserted */
+ std::vector<dtuple_t*>
+ entry_list;/* list of entries, one for each index */
+ std::vector<dtuple_t*>::iterator
+ entry; /*!< NULL, or entry to insert in the index;
+ after a successful insert of the entry,
+ this should be reset to NULL */
+ /** buffer for the system columns */
+ byte sys_buf[DATA_ROW_ID_LEN
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+ trx_id_t trx_id; /*!< trx id or the last trx which executed the
+ node */
+ byte vers_start_buf[8]; /* Buffers for System Versioning */
+ byte vers_end_buf[8]; /* system fields. */
+ mem_heap_t* entry_sys_heap;
+ /* memory heap used as auxiliary storage;
+ entry_list and sys fields are stored here;
+ if this is NULL, entry list should be created
+ and buffers for sys fields in row allocated */
+ void vers_update_end(row_prebuilt_t *prebuilt, bool history_row);
+};
+
+/** Create an insert object.
+@param ins_type INS_VALUES, ...
+@param table table where to insert
+@param heap memory heap
+@return the created object */
+inline ins_node_t *ins_node_create(ulint ins_type, dict_table_t *table,
+ mem_heap_t *heap)
+{
+ return new (mem_heap_alloc(heap, sizeof(ins_node_t)))
+ ins_node_t(ins_type, table);
+}
+
+#endif
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
new file mode 100644
index 00000000..469f1f8a
--- /dev/null
+++ b/storage/innobase/include/row0log.h
@@ -0,0 +1,239 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.h
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "mtr0types.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "dict0dict.h"
+#include "trx0types.h"
+#include "trx0undo.h"
+
+class ut_stage_alter_t;
+
+extern Atomic_counter<ulint> onlineddl_rowlog_rows;
+extern ulint onlineddl_rowlog_pct_used;
+extern ulint onlineddl_pct_progress;
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@retval true if success, false if not */
+bool
+row_log_allocate(
+/*=============*/
+ const trx_t* trx, /*!< in: the ALTER TABLE transaction */
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table, /*!< in/out: new table being rebuilt,
+ or NULL when creating a secondary index */
+ bool same_pk,/*!< in: whether the definition of the
+ PRIMARY KEY has remained the same */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added, changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL if !table */
+ const char* path, /*!< in: where to create temporary file */
+ const TABLE* old_table, /*!< in:table definition before alter */
+ bool allow_not_null) /*!< in: allow null to non-null
+ conversion */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+void
+row_log_free(
+/*=========*/
+ row_log_t* log) /*!< in,own: row log */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+inline void row_log_abort_sec(dict_index_t *index)
+{
+ ut_ad(index->lock.have_u_or_x());
+ ut_ad(!index->is_clust());
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ row_log_free(index->online_log);
+ index->online_log= nullptr;
+}
+
+/** Logs an operation to a secondary index that is (or was) being created.
+@param index index, S or X latched
+@param tuple index tuple
+@param trx_id transaction ID for insert, or 0 for delete
+@retval false if row_log_apply() failure happens
+or true otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+ trx_id_t trx_id) ATTRIBUTE_COLD;
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+dberr_t
+row_log_table_get_error(
+/*====================*/
+ const dict_index_t* index) /*!< in: clustered index of a table
+ that is being rebuilt online */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check whether a virtual column is indexed in the new table being
+created during alter table
+@param[in] index cluster index
+@param[in] v_no virtual column number
+@return true if it is indexed, else false */
+bool
+row_log_col_is_indexed(
+ const dict_index_t* index,
+ ulint v_no);
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+void
+row_log_table_delete(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+ be logged, or NULL to use those in rec */
+ ATTRIBUTE_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+void
+row_log_table_update(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const dtuple_t* old_pk);/*!< in: row_log_table_get_pk()
+ before the update */
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index),
+ or NULL */
+ byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for
+ row_log_table_delete(), or NULL */
+ mem_heap_t** heap) /*!< in/out: memory heap where allocated */
+ ATTRIBUTE_COLD __attribute__((nonnull(1,2,5), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+void
+row_log_table_insert(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets);/*!< in: rec_get_offsets(rec,index) */
+
+/** Apply the row_log_table log to a table upon completing rebuild.
+@param[in] thr query graph
+@param[in] old_table old table
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_table() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@param[in] new_table Altered table
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_table_apply(
+ que_thr_t* thr,
+ dict_table_t* old_table,
+ struct TABLE* table,
+ ut_stage_alter_t* stage,
+ dict_table_t* new_table)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+ dict_index_t* index) /*!< in: index, must be locked */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Apply the row log to the index upon completing index creation.
+@param[in] trx transaction (for checking if the operation was
+interrupted)
+@param[in,out] index secondary index
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_index() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_apply(
+ const trx_t* trx,
+ dict_index_t* index,
+ struct TABLE* table,
+ ut_stage_alter_t* stage)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Get the n_core_fields of online log for the index
+@param index index whose n_core_fields of log to be accessed
+@return number of n_core_fields */
+unsigned row_log_get_n_core_fields(const dict_index_t *index);
+
+/** Get the error code of online log for the index
+@param index online index
+@return error code present in online log */
+dberr_t row_log_get_error(const dict_index_t *index);
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Estimate how much work is to be done by the log apply phase
+of an ALTER TABLE for this index.
+@param[in] index index whose log to assess
+@return work to be done by log-apply in abstract units
+*/
+ulint
+row_log_estimate_work(
+ const dict_index_t* index);
+#endif /* HAVE_PSI_STAGE_INTERFACE */
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
new file mode 100644
index 00000000..93ea650d
--- /dev/null
+++ b/storage/innobase/include/row0merge.h
@@ -0,0 +1,496 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+#include "srv0srv.h"
+
+class ut_stage_alter_t;
+
+/* Reserve free space from every block for key_version */
+#define ROW_MERGE_RESERVE_SIZE 4
+
+/* Cluster index read task is mandatory */
+#define COST_READ_CLUSTERED_INDEX 1.0
+
+/* Basic fixed cost to build all type of index */
+#define COST_BUILD_INDEX_STATIC 0.5
+/* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */
+#define COST_BUILD_INDEX_DYNAMIC 0.5
+
+/* Sum of below two must be 1.0 */
+#define PCT_COST_MERGESORT_INDEX 0.4
+#define PCT_COST_INSERT_INDEX 0.6
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is srv_page_size, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as srv_page_size / 2. */
+typedef byte row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte mrec_t;
+
+/** Merge record in row_merge_buf_t */
+struct mtuple_t {
+ dfield_t* fields; /*!< data fields */
+};
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_t {
+ mem_heap_t* heap; /*!< memory heap where allocated */
+ dict_index_t* index; /*!< the index the tuples belong to */
+ ulint total_size; /*!< total amount of data bytes */
+ ulint n_tuples; /*!< number of data tuples */
+ ulint max_tuples; /*!< maximum number of data tuples */
+ mtuple_t* tuples; /*!< array of data tuples */
+ mtuple_t* tmp_tuples; /*!< temporary copy of tuples,
+ for sorting */
+};
+
+/** Information about temporary files used in merge sort */
+struct merge_file_t {
+ pfs_os_file_t fd; /*!< file descriptor */
+ ulint offset; /*!< file offset (end of file) */
+ ib_uint64_t n_rec; /*!< number of records in the file */
+};
+
+/** Index field definition */
+struct index_field_t {
+ ulint col_no; /*!< column offset */
+ ulint prefix_len; /*!< column prefix length, or 0
+ if indexing the whole column */
+ bool is_v_col; /*!< whether this is a virtual column */
+ bool descending; /*!< whether to use DESC order */
+};
+
+/** Definition of an index being created */
+struct index_def_t {
+ const char* name; /*!< index name */
+ bool rebuild; /*!< whether the table is rebuilt */
+ ulint ind_type; /*!< 0, DICT_UNIQUE,
+ or DICT_CLUSTERED */
+ ulint key_number; /*!< MySQL key number,
+ or ULINT_UNDEFINED if none */
+ ulint n_fields; /*!< number of fields in index */
+ index_field_t* fields; /*!< field definitions */
+ st_mysql_ftparser*
+ parser; /*!< fulltext parser plugin */
+};
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_t {
+ dict_index_t* index; /*!< index being sorted */
+ struct TABLE* table; /*!< MySQL table object */
+ const ulint* col_map;/*!< mapping of column numbers
+ in table to the rebuilt table
+ (index->table), or NULL if not
+ rebuilding table */
+ ulint n_dup; /*!< number of duplicates */
+};
+
+/*************************************************************//**
+Report a duplicate key. */
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+ MY_ATTRIBUTE((nonnull));
+
+/** Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@param trx dictionary transaction
+@param table table containing the indexes
+@param locked True if table is locked,
+ false - may need to do lazy drop
+@param alter_trx Alter table transaction */
+void
+row_merge_drop_indexes(
+ trx_t* trx,
+ dict_table_t* table,
+ bool locked,
+ const trx_t* alter_trx=NULL);
+
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes();
+
+/** Create temporary merge files in the given paramater path, and if
+UNIV_PFS_IO defined, register the file descriptor with Performance Schema.
+@param[in] path location for creating temporary merge files, or NULL
+@return File descriptor */
+pfs_os_file_t
+row_merge_file_create_low(
+ const char* path)
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file. And de-register the file from Performance Schema
+if UNIV_PFS_IO is defined. */
+void
+row_merge_file_destroy_low(
+/*=======================*/
+ const pfs_os_file_t& fd); /*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Create the index and load in to the dictionary.
+@param[in,out] table the index is on this table
+@param[in] index_def the index definition
+@param[in] add_v new virtual columns added along with add
+ index call
+@return index, or NULL on error */
+dict_index_t*
+row_merge_create_index(
+ dict_table_t* table,
+ const index_def_t* index_def,
+ const dict_add_v_col_t* add_v)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return whether the index can be used by the transaction */
+bool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to check */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Map from column numbers to column definitions that include
+changes to the collation, when the encoding is compatible with
+the original column and no table rebuild is needed */
+typedef std::map<unsigned, dict_col_t*> col_collations;
+
+/** Build indexes on a table by reading a clustered index, creating a temporary
+file containing index entries, merge sorting these index entries and inserting
+sorted index entries to indexes.
+@param[in] trx transaction
+@param[in] old_table table where rows are read from
+@param[in] new_table table where indexes are created; identical to
+old_table unless creating a PRIMARY KEY
+@param[in] online true if creating indexes online
+@param[in] indexes indexes to be created
+@param[in] key_numbers MySQL key numbers
+@param[in] n_indexes size of indexes[]
+@param[in,out] table MySQL table, for reporting erroneous key value
+if applicable
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] col_map mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in] add_autoinc number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out] sequence autoinc sequence
+@param[in] skip_pk_sort whether the new PRIMARY KEY will follow
+existing order
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
+this function and it will be passed to other functions for further accounting.
+@param[in] add_v new virtual columns added along with indexes
+@param[in] eval_table mysql table used to evaluate virtual column
+ value, see innobase_get_computed_value().
+@param[in] allow_non_null allow the conversion from null to not-null
+@param[in] col_collate columns whose collations changed, or nullptr
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_build_indexes(
+ trx_t* trx,
+ dict_table_t* old_table,
+ dict_table_t* new_table,
+ bool online,
+ dict_index_t** indexes,
+ const ulint* key_numbers,
+ ulint n_indexes,
+ struct TABLE* table,
+ const dtuple_t* defaults,
+ const ulint* col_map,
+ ulint add_autoinc,
+ ib_sequence_t& sequence,
+ bool skip_pk_sort,
+ ut_stage_alter_t* stage,
+ const dict_add_v_col_t* add_v,
+ struct TABLE* eval_table,
+ bool allow_non_null,
+ const col_collations* col_collate)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Write a buffer to a block.
+@param buf sorted buffer
+@param block buffer for writing to file
+@param blob_file blob file handle for doing bulk insert operation */
+dberr_t row_merge_buf_write(const row_merge_buf_t *buf,
+#ifndef DBUG_OFF
+ const merge_file_t *of, /*!< output file */
+#endif
+ row_merge_block_t *block,
+ merge_file_t *blob_file= nullptr);
+
+/********************************************************************//**
+Sort a buffer. */
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+ MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return whether the request was completed successfully
+@retval false on error
+@retval true on success */
+bool
+row_merge_write(
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t elements */
+ const void* buf, /*!< in: data */
+ void* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Create a merge file in the given location.
+@param[out] merge_file merge file structure
+@param[in] path location for creating temporary file, or NULL
+@return file descriptor, or -1 on failure */
+pfs_os_file_t
+row_merge_file_create(
+ merge_file_t* merge_file,
+ const char* path)
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1)));
+
+/** Merge disk files.
+@param[in] trx transaction
+@param[in] dup descriptor of index being created
+@param[in,out] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] tmpfd temporary file handle
+@param[in] update_progress true, if we should update progress status
+@param[in] pct_progress total progress percent until now
+@param[in] pct_ocst current progress percent
+@param[in] crypt_block crypt buf or NULL
+@param[in] space space_id
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
+and then stage->inc() will be called for each record processed.
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_sort(
+/*===========*/
+ trx_t* trx,
+ const row_merge_dup_t* dup,
+ merge_file_t* file,
+ row_merge_block_t* block,
+ pfs_os_file_t* tmpfd,
+ const bool update_progress,
+ const double pct_progress,
+ const double pct_cost,
+ row_merge_block_t* crypt_block,
+ ulint space,
+ ut_stage_alter_t* stage = NULL)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+ MY_ATTRIBUTE((warn_unused_result, nonnull, malloc));
+
+/*********************************************************************//**
+Deallocate a sort buffer. */
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */
+ MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Destroy a merge file. */
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< in/out: merge file structure */
+ MY_ATTRIBUTE((nonnull));
+
+/** Read a merge block from the file system.
+@return whether the request was completed successfully */
+bool
+row_merge_read(
+/*===========*/
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read
+ in number of row_merge_block_t
+ elements */
+ row_merge_block_t* buf, /*!< out: data */
+ row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ rec_offs* offsets,/*!< out: offsets of mrec */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Buffer for bulk insert */
+class row_merge_bulk_t
+{
+ /** Buffer for each index in the table. main memory
+ buffer for sorting the index */
+ row_merge_buf_t *m_merge_buf;
+ /** Block for IO operation */
+ row_merge_block_t *m_block= nullptr;
+ /** File to store the buffer and used for merge sort */
+ merge_file_t *m_merge_files= nullptr;
+ /** Temporary file to be used for merge sort */
+ pfs_os_file_t m_tmpfd;
+ /** Allocate memory for merge file data structure */
+ ut_allocator<row_merge_block_t> m_alloc;
+ /** Storage for description for the m_alloc */
+ ut_new_pfx_t m_block_pfx;
+ /** Temporary file to store the blob */
+ merge_file_t m_blob_file;
+ /** Storage for description for the crypt_block */
+ ut_new_pfx_t m_crypt_pfx;
+ /** Block for encryption */
+ row_merge_block_t *m_crypt_block= nullptr;
+public:
+ /** Constructor.
+ Create all merge files, merge buffer for all the table indexes
+ expect fts indexes.
+ Create a merge block which is used to write IO operation
+ @param table table which undergoes bulk insert operation */
+ row_merge_bulk_t(dict_table_t *table);
+
+ /** Destructor.
+ Remove all merge files, merge buffer for all table indexes. */
+ ~row_merge_bulk_t();
+
+ /** Remove all buffer for the table indexes */
+ void remove_all_bulk_buffer();
+
+ /** Clean the merge buffer for the given index number */
+ void clean_bulk_buffer(ulint index_no);
+
+ /** Create the temporary file for the given index number
+ @retval true if temporary file creation went well */
+ bool create_tmp_file(ulint index_no);
+
+ /** Write the merge buffer to the tmp file for the given
+ index number.
+ @param index_no buffer to be written for the index */
+ dberr_t write_to_tmp_file(ulint index_no);
+
+ /** Add the tuple to the merge buffer for the given index.
+ If the buffer ran out of memory then write the buffer into
+ the temporary file and do insert the tuple again.
+ @param row tuple to be inserted
+ @param ind index to be buffered
+ @param trx bulk transaction */
+ dberr_t bulk_insert_buffered(const dtuple_t &row, const dict_index_t &ind,
+ trx_t *trx);
+
+ /** Do bulk insert operation into the index tree from
+ buffer or merge file if exists
+ @param index_no index to be inserted
+ @param trx bulk transaction */
+ dberr_t write_to_index(ulint index_no, trx_t *trx);
+
+ /** Do bulk insert for the buffered insert for the table.
+ @param table table which undergoes for bulk insert operation
+ @param trx bulk transaction */
+ dberr_t write_to_table(dict_table_t *table, trx_t *trx);
+
+ /** Allocate block for writing the buffer into disk */
+ dberr_t alloc_block();
+
+ /** Init temporary files for each index */
+ void init_tmp_file();
+};
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
new file mode 100644
index 00000000..878d9c9f
--- /dev/null
+++ b/storage/innobase/include/row0mysql.h
@@ -0,0 +1,841 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "fil0fil.h"
+#include "fts0fts.h"
+#include "gis0type.h"
+
+struct row_prebuilt_t;
+class ha_innobase;
+class ha_handler_stats;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen);/*!< in: storage length of len: either 1
+ or 2 bytes */
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len); /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len); /*!< in: BLOB reference length
+ (not BLOB length) */
+/*******************************************************************//**
+Converts InnoDB geometry data format to MySQL data format. */
+void
+row_mysql_store_geometry(
+/*=====================*/
+ byte* dest, /*!< in/out: where to store */
+ ulint dest_len, /*!< in: dest buffer size: determines into
+ how many bytes the geometry length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const byte* src, /*!< in: geometry data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint src_len); /*!< in: geometry length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+/**************************************************************//**
+Pad a column with spaces. */
+void
+row_mysql_pad_col(
+/*==============*/
+ ulint mbminlen, /*!< in: minimum size of a character,
+ in bytes */
+ byte* pad, /*!< out: padded buffer */
+ ulint len); /*!< in: number of bytes to pad */
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.cc.
+@return up to which byte we used buf in the conversion */
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! NOTE that dfield
+ may also get a pointer to 'buf',
+ therefore do not discard this as long
+ as dfield is used! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp); /*!< in: nonzero=compact format */
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return true if it was a lock wait and we should continue running the
+query thread */
+bool
+row_mysql_handle_errors(
+/*====================*/
+ dberr_t* new_err,/*!< out: possible new error encountered in
+ rollback, or the old error which was
+ during the function entry */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread, or NULL */
+ trx_savept_t* savept) /*!< in: savepoint, or NULL */
+ MY_ATTRIBUTE((nonnull(1,2)));
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table, /*!< in: Innobase table handle */
+ ulint mysql_row_len); /*!< in: length in bytes of a row in
+ the MySQL format */
+/** Free a prebuilt struct for a TABLE handle. */
+void row_prebuilt_free(row_prebuilt_t *prebuilt);
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx); /*!< in: transaction handle */
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Lock a table.
+@param[in,out] prebuilt table handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table(row_prebuilt_t* prebuilt);
+
+/** System Versioning: row_insert_for_mysql() modes */
+enum ins_mode_t {
+ /* plain row (without versioning) */
+ ROW_INS_NORMAL = 0,
+ /* row_start = TRX_ID, row_end = MAX */
+ ROW_INS_VERSIONED,
+ /* row_end = TRX_ID */
+ ROW_INS_HISTORICAL
+};
+
+/** Does an insert for MySQL.
+@param[in] mysql_rec row in the MySQL format
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@param[in] ins_mode what row type we're inserting
+@return error code or DB_SUCCESS*/
+dberr_t
+row_insert_for_mysql(
+ const byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ ins_mode_t ins_mode)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/** Does an update or delete of a row for MySQL.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_for_mysql(
+ row_prebuilt_t* prebuilt)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** This can only be used when the current transaction is at
+READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_mvcc() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@param[in] has_latches_on_recs TRUE if called so that we have the
+ latches on the records under pcur
+ and clust_pcur, and we do not need
+ to reposition the cursors. */
+void
+row_unlock_for_mysql(
+ row_prebuilt_t* prebuilt,
+ ibool has_latches_on_recs);
+
+/*********************************************************************//**
+Creates an query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap); /*!< in: mem heap from which allocated */
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Lock the data dictionary cache exclusively. */
+#define row_mysql_lock_data_dictionary(trx) \
+ do { \
+ ut_ad(!trx->dict_operation_lock_mode); \
+ dict_sys.lock(SRW_LOCK_CALL); \
+ trx->dict_operation_lock_mode = true; \
+ } while (0)
+
+/** Unlock the data dictionary. */
+#define row_mysql_unlock_data_dictionary(trx) \
+ do { \
+ ut_ad(!lock_trx_has_sys_table_locks(trx)); \
+ ut_ad(trx->dict_operation_lock_mode); \
+ trx->dict_operation_lock_mode = false; \
+ dict_sys.unlock(); \
+ } while (0)
+
+/*********************************************************************//**
+Creates a table for MySQL. On failure the transaction will be rolled back
+and the 'table' object will be freed.
+@return error code or DB_SUCCESS */
+dberr_t
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed, or on DB_SUCCESS
+ added to the data dictionary cache) */
+ trx_t* trx) /*!< in/out: transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Create an index when creating a table.
+On failure, the caller must drop the table!
+@return error number or DB_SUCCESS */
+dberr_t
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths, /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Discards the tablespace of a table which stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the file_unreadable flag is set.
+@return error code or DB_SUCCESS */
+dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+dberr_t
+row_import_tablespace_for_mysql(
+/*============================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in/out: transaction */
+ bool use_fk) /*!< in: whether to parse and enforce
+ FOREIGN KEY constraints */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/* A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+struct mysql_row_templ_t {
+ ulint col_no; /*!< column number of the column */
+ ulint rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ibool rec_field_is_prefix; /* is this field in a prefix index? */
+ ulint rec_prefix_field_no; /* record field, even if just a
+ prefix; same as rec_field_no when not a
+ prefix, otherwise rec_field_no is
+ ULINT_UNDEFINED but this is the true
+ field number*/
+ ulint clust_rec_field_no; /*!< field number of the column in an
+ Innobase record in the clustered index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ulint icp_rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined unless
+ index condition pushdown is used */
+ ulint mysql_col_offset; /*!< offset of the column in the MySQL
+ row format */
+ ulint mysql_col_len; /*!< length of the column in the MySQL
+ row format */
+ ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a
+ MySQL record */
+ ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit,
+ zero if column cannot be NULL */
+ ulint type; /*!< column type in Innobase mtype
+ numbers DATA_CHAR... */
+ ulint mysql_type; /*!< MySQL type code; this is always
+ < 256 */
+ ulint mysql_length_bytes; /*!< if mysql_type
+ == DATA_MYSQL_TRUE_VARCHAR, this tells
+ whether we should use 1 or 2 bytes to
+ store the MySQL true VARCHAR data
+ length at the start of row in the MySQL
+ format (NOTE that the MySQL key value
+ format always uses 2 bytes for the data
+ len) */
+ ulint charset; /*!< MySQL charset-collation code
+ of the column, or zero */
+ ulint mbminlen; /*!< minimum length of a char, in bytes,
+ or zero if not a char type */
+ ulint mbmaxlen; /*!< maximum length of a char, in bytes,
+ or zero if not a char type */
+ ulint is_unsigned; /*!< if a column type is an integer
+ type and this field is != 0, then
+ it is an unsigned integer type */
+ ulint is_virtual; /*!< if a column is a virtual column */
+};
+
+#define MYSQL_FETCH_CACHE_SIZE 8
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD 4
+
+#define ROW_PREBUILT_ALLOCATED 78540783
+#define ROW_PREBUILT_FREED 26423527
+
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+struct row_prebuilt_t {
+ ulint magic_n; /*!< this magic number is set to
+ ROW_PREBUILT_ALLOCATED when created,
+ or ROW_PREBUILT_FREED when the
+ struct has been freed */
+ dict_table_t* table; /*!< Innobase table handle */
+ dict_index_t* index; /*!< current index for a search, if
+ any */
+ trx_t* trx; /*!< current transaction handle */
+ unsigned sql_stat_start:1;/*!< TRUE when we start processing of
+ an SQL statement: we may have to set
+ an intention lock on the table,
+ create a consistent read view etc. */
+ unsigned clust_index_was_generated:1;
+ /*!< if the user did not define a
+ primary key in MySQL, then Innobase
+ automatically generated a clustered
+ index where the ordering column is
+ the row id: in this case this flag
+ is set to TRUE */
+ unsigned index_usable:1; /*!< caches the value of
+ row_merge_is_index_usable(trx,index) */
+ unsigned read_just_key:1;/*!< set to 1 when MySQL calls
+ ha_innobase::extra with the
+ argument HA_EXTRA_KEYREAD; it is enough
+ to read just columns defined in
+ the index (i.e., no read of the
+ clustered index record necessary) */
+ unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this
+ handle in a MySQL HANDLER low level
+ index cursor command: then we must
+ store the pcur position even in a
+ unique search from a clustered index,
+ because HANDLER allows NEXT and PREV
+ in such a situation */
+ unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW,
+ ROW_MYSQL_REC_FIELDS,
+ ROW_MYSQL_DUMMY_TEMPLATE, or
+ ROW_MYSQL_NO_TEMPLATE */
+ unsigned n_template:10; /*!< number of elements in the
+ template */
+ unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL
+ bitmap at the start of a row in the
+ MySQL format */
+ unsigned need_to_access_clustered:1; /*!< if we are fetching
+ columns through a secondary index
+ and at least one column is not in
+ the secondary index, then this is
+ set to TRUE; note that sometimes this
+ is set but we later optimize out the
+ clustered index lookup */
+ unsigned templ_contains_blob:1;/*!< TRUE if the template contains
+ a column with DATA_LARGE_MTYPE(
+ get_innobase_type_from_mysql_type())
+ is TRUE;
+ not to be confused with InnoDB
+ externally stored columns
+ (VARCHAR can be off-page too) */
+ unsigned versioned_write:1;/*!< whether this is
+ a versioned write */
+ mysql_row_templ_t* mysql_template;/*!< template used to transform
+ rows fast between MySQL and Innobase
+ formats; memory for this template
+ is not allocated from 'heap' */
+ mem_heap_t* heap; /*!< memory heap from which
+ these auxiliary structures are
+ allocated when needed */
+ ins_node_t* ins_node; /*!< Innobase SQL insert node
+ used to perform inserts
+ to the table */
+ byte* ins_upd_rec_buff;/*!< buffer for storing data converted
+ to the Innobase format from the MySQL
+ format */
+ const byte* default_rec; /*!< the default values of all columns
+ (a "default row") in MySQL format */
+ ulint hint_need_to_fetch_extra_cols;
+ /*!< normally this is set to 0; if this
+ is set to ROW_RETRIEVE_PRIMARY_KEY,
+ then we should at least retrieve all
+ columns in the primary key; if this
+ is set to ROW_RETRIEVE_ALL_COLS, then
+ we must retrieve all columns in the
+ key (if read_just_key == 1), or all
+ columns in the table */
+ upd_node_t* upd_node; /*!< Innobase SQL update node used
+ to perform updates and deletes */
+ trx_id_t trx_id; /*!< The table->def_trx_id when
+ ins_graph was built */
+ que_fork_t* ins_graph; /*!< Innobase SQL query graph used
+ in inserts. Will be rebuilt on
+ trx_id or n_indexes mismatch. */
+ que_fork_t* upd_graph; /*!< Innobase SQL query graph used
+ in updates or deletes */
+ btr_pcur_t* pcur; /*!< persistent cursor used in selects
+ and updates */
+ btr_pcur_t* clust_pcur; /*!< persistent cursor used in
+ some selects and updates */
+ que_fork_t* sel_graph; /*!< dummy query graph used in
+ selects */
+ dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */
+ byte row_id[DATA_ROW_ID_LEN];
+ /*!< if the clustered index was
+ generated, the row id of the
+ last row fetched is stored
+ here */
+ doc_id_t fts_doc_id; /* if the table has an FTS index on
+ it then we fetch the doc_id.
+ FTS-FIXME: Currently we fetch it always
+ but in the future we must only fetch
+ it when FTS columns are being
+ updated */
+ dtuple_t* clust_ref; /*!< prebuilt dtuple used in
+ sel/upd/del */
+ lock_mode select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+ bool skip_locked; /*!< TL_{READ,WRITE}_SKIP_LOCKED */
+ lock_mode stored_select_lock_type;/*!< this field is used to
+ remember the original select_lock_type
+ that was decided in ha_innodb.cc,
+ ::store_lock(), ::external_lock(),
+ etc. */
+ ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks
+ should be the obtained for records
+ under an UPDATE or DELETE cursor.
+ At READ UNCOMMITTED or
+ READ COMMITTED isolation level,
+ this can be set to
+ ROW_READ_TRY_SEMI_CONSISTENT, so that
+ if the row under an UPDATE or DELETE
+ cursor was locked by another
+ transaction, InnoDB will resort
+ to reading the last committed value
+ ('semi-consistent read'). Then,
+ this field will be set to
+ ROW_READ_DID_SEMI_CONSISTENT to
+ indicate that. If the row does not
+ match the WHERE condition, MySQL will
+ invoke handler::unlock_row() to
+ clear the flag back to
+ ROW_READ_TRY_SEMI_CONSISTENT and
+ to simply skip the row. If
+ the row matches, the next call to
+ row_search_mvcc() will lock
+ the row.
+ This eliminates lock waits in some
+ cases; note that this breaks
+ serializability. */
+ ulint new_rec_locks; /*!< normally 0; if
+ the session is using READ
+ COMMITTED or READ UNCOMMITTED
+ isolation level, set in
+ row_search_mvcc() if we set a new
+ record lock on the secondary
+ or clustered index; this is
+ used in row_unlock_for_mysql()
+ when releasing the lock under
+ the cursor if we determine
+ after retrieving the row that
+ it does not need to be locked
+ ('mini-rollback') */
+ ulint mysql_prefix_len;/*!< byte offset of the end of
+ the last requested column */
+ ulint mysql_row_len; /*!< length in bytes of a row in the
+ MySQL format */
+ ulint n_rows_fetched; /*!< number of rows fetched after
+ positioning the current cursor */
+ ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+ byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+ /*!< a cache for fetched rows if we
+ fetch many rows from the same cursor:
+ it saves CPU time to fetch them in a
+ batch; we reserve mysql_row_len
+ bytes for each such row; these
+ pointers point 4 bytes past the
+ allocated mem buf start, because
+ there is a 4 byte magic number at the
+ start and at the end */
+ bool keep_other_fields_on_keyread; /*!< when using fetch
+ cache with HA_EXTRA_KEYREAD, don't
+ overwrite other fields in mysql row
+ row buffer.*/
+ ulint fetch_cache_first;/*!< position of the first not yet
+ fetched row in fetch_cache */
+ ulint n_fetch_cached; /*!< number of not yet fetched rows
+ in fetch_cache */
+ mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied
+ to this heap */
+ mem_heap_t* old_vers_heap; /*!< memory heap where a previous
+ version is built in consistent read */
+ bool in_fts_query; /*!< Whether we are in a FTS query */
+ bool fts_doc_id_in_read_set; /*!< true if table has externally
+ defined FTS_DOC_ID coulmn. */
+ /*----------------------*/
+ ulonglong autoinc_last_value;
+ /*!< last value of AUTO-INC interval */
+ ulonglong autoinc_increment;/*!< The increment step of the auto
+ increment column. Value must be
+ greater than or equal to 1. Required to
+ calculate the next value */
+ ulonglong autoinc_offset; /*!< The offset passed to
+ get_auto_increment() by MySQL. Required
+ to calculate the next value */
+ dberr_t autoinc_error; /*!< The actual error code encountered
+ while trying to init or read the
+ autoinc value from the table. We
+ store it here so that we can return
+ it to MySQL */
+ /*----------------------*/
+
+ /** Argument of handler_rowid_filter_check(),
+ or NULL if no PRIMARY KEY filter is pushed */
+ ha_innobase* pk_filter;
+
+ /** Argument to handler_index_cond_check(),
+ or NULL if no index condition pushdown (ICP) is used. */
+ ha_innobase* idx_cond;
+ ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols.
+ 0 if and only if idx_cond == NULL. */
+ /*----------------------*/
+
+ /*----------------------*/
+ rtr_info_t* rtr_info; /*!< R-tree Search Info */
+ /*----------------------*/
+
+ ulint magic_n2; /*!< this should be the same as
+ magic_n */
+
+ byte* srch_key_val1; /*!< buffer used in converting
+ search key values from MySQL format
+ to InnoDB format.*/
+ byte* srch_key_val2; /*!< buffer used in converting
+ search key values from MySQL format
+ to InnoDB format.*/
+ uint srch_key_val_len; /*!< Size of search key */
+ /** The MySQL table object */
+ TABLE* m_mysql_table;
+
+ /** Get template by dict_table_t::cols[] number */
+ const mysql_row_templ_t* get_template_by_col(ulint col) const
+ {
+ ut_ad(col < n_template);
+ ut_ad(mysql_template);
+ for (ulint i = col; i < n_template; ++i) {
+ const mysql_row_templ_t* templ = &mysql_template[i];
+ if (!templ->is_virtual && templ->col_no == col) {
+ return templ;
+ }
+ }
+ return NULL;
+ }
+};
+
+/** Callback for row_mysql_sys_index_iterate() */
+struct SysIndexCallback {
+ virtual ~SysIndexCallback() = default;
+
+ /** Callback method
+ @param mtr current mini transaction
+ @param pcur persistent cursor. */
+ virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0;
+};
+
+
+/** Storage for calculating virtual columns */
+
+class String;
+struct VCOL_STORAGE
+{
+ TABLE *maria_table;
+ byte *innobase_record;
+ byte *maria_record;
+ String *blob_value_storage;
+ VCOL_STORAGE(): maria_table(NULL), innobase_record(NULL),
+ maria_record(NULL), blob_value_storage(NULL) {}
+};
+
+/**
+ Allocate a heap and record for calculating virtual fields
+ Used mainly for virtual fields in indexes
+
+@param[in] thd MariaDB THD
+@param[in] index Index in use
+@param[out] heap Heap that holds temporary row
+@param[in,out] mysql_table MariaDB table
+@param[out] rec Pointer to allocated MariaDB record
+@param[out] storage Internal storage for blobs etc
+
+@return FALSE ok
+@return TRUE malloc failure
+*/
+
+bool innobase_allocate_row_for_vcol(THD *thd,
+ const dict_index_t* index,
+ mem_heap_t** heap,
+ TABLE** table,
+ VCOL_STORAGE* storage);
+
+/** Free memory allocated by innobase_allocate_row_for_vcol() */
+void innobase_free_row_for_vcol(VCOL_STORAGE *storage);
+
+class ib_vcol_row
+{
+ VCOL_STORAGE storage;
+public:
+ mem_heap_t *heap;
+
+ ib_vcol_row(mem_heap_t *heap) : heap(heap) {}
+
+ byte *record(THD *thd, const dict_index_t *index, TABLE **table)
+ {
+ if (!storage.innobase_record &&
+ !innobase_allocate_row_for_vcol(thd, index, &heap, table, &storage))
+ return nullptr;
+ return storage.innobase_record;
+ }
+
+ ~ib_vcol_row()
+ {
+ if (heap)
+ {
+ if (storage.innobase_record)
+ innobase_free_row_for_vcol(&storage);
+ mem_heap_free(heap);
+ }
+ }
+};
+
+/** Report virtual value computation failure in ib::error
+@param[in] row the data row
+*/
+ATTRIBUTE_COLD
+void innobase_report_computed_value_failed(dtuple_t *row);
+
+/** Get the computed value by supplying the base column values.
+@param[in,out] row the data row
+@param[in] col virtual column
+@param[in] index index on the virtual column
+@param[in,out] local_heap heap memory for processing large data etc.
+@param[in,out] heap memory heap that copies the actual index row
+@param[in] ifield index field
+@param[in] thd connection handle
+@param[in,out] mysql_table MariaDB table handle
+@param[in,out] mysql_rec MariaDB record buffer
+@param[in] old_table during ALTER TABLE, this is the old table
+ or NULL.
+@param[in] update update vector for the parent row
+@param[in] ignore_warnings ignore warnings during calculation. Usually
+ means that a calculation is internal and
+ should have no side effects.
+@return the field filled with computed value */
+dfield_t*
+innobase_get_computed_value(
+ dtuple_t* row,
+ const dict_v_col_t* col,
+ const dict_index_t* index,
+ mem_heap_t** local_heap,
+ mem_heap_t* heap,
+ const dict_field_t* ifield,
+ THD* thd,
+ TABLE* mysql_table,
+ byte* mysql_rec,
+ const dict_table_t* old_table=NULL,
+ const upd_t* update=NULL,
+ bool ignore_warnings=false);
+
+/** Change dbname and table name in table->vc_templ.
+@param[in,out] table the table whose virtual column template
+dbname and tbname to be renamed. */
+void
+innobase_rename_vc_templ(
+ dict_table_t* table);
+
+#define ROW_PREBUILT_FETCH_MAGIC_N 465765687
+
+#define ROW_MYSQL_WHOLE_ROW 0
+#define ROW_MYSQL_REC_FIELDS 1
+#define ROW_MYSQL_NO_TEMPLATE 2
+#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in
+ row_check_index() */
+
+/* Values for hint_need_to_fetch_extra_cols */
+#define ROW_RETRIEVE_PRIMARY_KEY 1
+#define ROW_RETRIEVE_ALL_COLS 2
+
+/* Values for row_read_type */
+#define ROW_READ_WITH_LOCKS 0
+#define ROW_READ_TRY_SEMI_CONSISTENT 1
+#define ROW_READ_DID_SEMI_CONSISTENT 2
+
+#endif /* row0mysql.h */
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
new file mode 100644
index 00000000..1daf4d4a
--- /dev/null
+++ b/storage/innobase/include/row0purge.h
@@ -0,0 +1,149 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0purge.h
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "row0mysql.h"
+#include "mysqld.h"
+#include <queue>
+#include <unordered_map>
+
+class MDL_ticket;
+/** Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+not delete marked version of a clustered index record where DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns true and then false, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@param[in,out] node row purge node
+@param[in] index secondary index
+@param[in] entry secondary index entry
+@param[in,out] sec_pcur secondary index cursor or NULL
+ if it is called for purge buffering
+ operation.
+@param[in,out] sec_mtr mini-transaction which holds
+ secondary index entry or NULL if it is
+ called for purge buffering operation.
+@param[in] is_tree true=pessimistic purge,
+ false=optimistic (leaf-page only)
+@return true if the secondary index record can be purged */
+bool
+row_purge_poss_sec(
+ purge_node_t* node,
+ dict_index_t* index,
+ const dtuple_t* entry,
+ btr_pcur_t* sec_pcur=NULL,
+ mtr_t* sec_mtr=NULL,
+ bool is_tree=false);
+
+/***************************************************************
+Does the purge operation.
+@return query thread to run next */
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Purge worker context */
+struct purge_node_t
+{
+ /** node type: QUE_NODE_PURGE */
+ que_common_t common;
+
+ /** DB_TRX_ID of the undo log record */
+ trx_id_t trx_id;
+ /** DB_ROLL_PTR pointing to undo log record */
+ roll_ptr_t roll_ptr;
+
+ /** undo number of the record */
+ undo_no_t undo_no;
+
+ /** record type: TRX_UNDO_INSERT_REC, ... */
+ byte rec_type;
+ /** compiler analysis info of an update */
+ byte cmpl_info;
+ /** whether the clustered index record determined by ref was found
+ in the clustered index of the table, and we were able to position
+ pcur on it */
+ bool found_clust;
+#ifdef UNIV_DEBUG
+ /** whether the operation is in progress */
+ bool in_progress= false;
+#endif
+ /** table where purge is done */
+ dict_table_t *table= nullptr;
+ /** update vector for a clustered index record */
+ upd_t *update;
+ /** row reference to the next row to handle, or nullptr */
+ const dtuple_t *ref;
+ /** nullptr, or a deep copy of the indexed fields of the row to handle */
+ dtuple_t *row;
+ /** nullptr, or the next index of table whose record should be handled */
+ dict_index_t *index;
+ /** memory heap used as auxiliary storage; must be emptied between rows */
+ mem_heap_t *heap;
+ /** persistent cursor to the clustered index record */
+ btr_pcur_t pcur;
+
+ /** Undo recs to purge */
+ std::queue<trx_purge_rec_t> undo_recs;
+
+ /** map of table identifiers to table handles and meta-data locks */
+ std::unordered_map<table_id_t, std::pair<dict_table_t*,MDL_ticket*>> tables;
+
+ /** Constructor */
+ explicit purge_node_t(que_thr_t *parent) :
+ common(QUE_NODE_PURGE, parent), heap(mem_heap_create(256)),
+ tables(TRX_PURGE_TABLE_BUCKETS) {}
+
+#ifdef UNIV_DEBUG
+ /** Validate the persistent cursor. The purge node has two references
+ to the clustered index record: ref and pcur, which must match
+ each other if found_clust.
+ @return whether pcur is consistent with ref */
+ bool validate_pcur();
+#endif
+
+ /** Start processing an undo log record. */
+ inline void start();
+
+ /** Reset the state at end
+ @return the query graph parent */
+ inline que_node_t *end(THD *);
+};
diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h
new file mode 100644
index 00000000..b05b7666
--- /dev/null
+++ b/storage/innobase/include/row0quiesce.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0quiesce.h
+
+Header file for tablespace quiesce functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0quiesce_h
+#define row0quiesce_h
+
+#include "dict0types.h"
+
+struct trx_t;
+
+/** The version number of the export meta-data text file. */
+#define IB_EXPORT_CFG_VERSION_V1 0x1UL
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+void
+row_quiesce_table_start(
+/*====================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+ MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or errro code. */
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ ib_quiesce_t state, /*!< in: quiesce state to set */
+ trx_t* trx) /*!< in/out: transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+void
+row_quiesce_table_complete(
+/*=======================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+ MY_ATTRIBUTE((nonnull));
+
+#endif /* row0quiesce_h */
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
new file mode 100644
index 00000000..a1350740
--- /dev/null
+++ b/storage/innobase/include/row0row.h
@@ -0,0 +1,431 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.h
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "que0types.h"
+#include "ibuf0ibuf.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+
+/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: record offsets */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/* Flags for row build type. */
+#define ROW_BUILD_NORMAL 0 /*!< build index row */
+#define ROW_BUILD_FOR_PURGE 1 /*!< build row for purge. */
+#define ROW_BUILD_FOR_UNDO 2 /*!< build row for undo. */
+#define ROW_BUILD_FOR_INSERT 3 /*!< build row for insert. */
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap, /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE
+ or ROW_BUILD_FOR_UNDO */
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4)));
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4)));
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the first only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead; the user
+ columns in this table should be
+ the same columns as in index->table */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added, changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+
+/** An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, with possible indexing on ongoing
+addition of new virtual columns.
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA;
+@param[in] index clustered index
+@param[in] rec record in the clustered index
+@param[in] offsets rec_get_offsets(rec,index) or NULL
+@param[in] col_table table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] add_v new virtual columns added
+ along with new indexes
+@param[in] col_map mapping of old column
+ numbers to new ones, or NULL
+@param[in] ext cache of externally stored column
+ prefixes, or NULL
+@param[in] heap memory heap from which
+ the memory needed is allocated
+@return own: row built */
+dtuple_t*
+row_build_w_add_vcol(
+ ulint type,
+ const dict_index_t* index,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ const dict_table_t* col_table,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ row_ext_t** ext,
+ mem_heap_t* heap);
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built */
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in/out: rec_get_offsets(rec) */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Convert a metadata record to a data tuple.
+@param[in] rec metadata record
+@param[in] index clustered index after instant ALTER TABLE
+@param[in] offsets rec_get_offsets(rec)
+@param[in,out] heap memory heap for allocations
+@param[in] info_bits the info_bits after an update
+@param[in] pad whether to pad to index->n_fields */
+dtuple_t*
+row_metadata_to_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap,
+ ulint info_bits,
+ bool pad)
+ MY_ATTRIBUTE((nonnull,warn_unused_result));
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ rec_offs* offsets)/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ MY_ATTRIBUTE((nonnull(1,2,3)));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: secondary index record;
+ must be preserved while ref is used, as we do
+ not copy field values to heap */
+ const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row
+reference.
+@return true if found */
+bool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Parse the integer data from specified data, which could be
+DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0
+and the type is not unsigned then we reset the value to 0
+@param[in] data data to read
+@param[in] len length of data
+@param[in] mtype mtype of data
+@param[in] unsigned_type if the data is unsigned
+@return the integer value from the data */
+inline
+ib_uint64_t
+row_parse_int(
+ const byte* data,
+ ulint len,
+ ulint mtype,
+ bool unsigned_type);
+
+/** Result of row_search_index_entry */
+enum row_search_result {
+ ROW_FOUND = 0, /*!< the record was found */
+ ROW_NOT_FOUND, /*!< record not found */
+ ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or
+ BTR_DELETE_MARK was specified, the
+ secondary index leaf page was not in
+ the buffer pool, and the operation was
+ enqueued in the insert/delete buffer */
+ ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and
+ row_purge_poss_sec() failed */
+};
+
+/***************************************************************//**
+Searches an index record.
+@return whether the record was found or buffered */
+enum row_search_result
+row_search_index_entry(
+/*===================*/
+ const dtuple_t* entry, /*!< in: index entry */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define ROW_COPY_DATA 1
+#define ROW_COPY_POINTERS 2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record. */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Prepare to start a mini-transaction to modify an index.
+@param[in,out] mtr mini-transaction
+@param[in,out] index possibly secondary index
+@param[in] pessimistic whether this is a pessimistic operation */
+inline
+void
+row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic)
+{
+ mtr->start();
+
+ switch (index->table->space_id) {
+ case IBUF_SPACE_ID:
+ if (pessimistic
+ && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) {
+ ibuf_free_excess_pages();
+ }
+ break;
+ case SRV_TMP_SPACE_ID:
+ mtr->set_log_mode(MTR_LOG_NO_REDO);
+ break;
+ default:
+ index->set_modified(*mtr);
+ break;
+ }
+
+ log_free_check();
+}
+
+#include "row0row.inl"
+
+#endif
diff --git a/storage/innobase/include/row0row.inl b/storage/innobase/include/row0row.inl
new file mode 100644
index 00000000..e89adb58
--- /dev/null
+++ b/storage/innobase/include/row0row.inl
@@ -0,0 +1,221 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.ic
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: record offsets */
+{
+ ulint offset;
+ ulint len;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+
+ offset = rec_get_nth_field_offs(offsets, index->db_trx_id(), &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ return(offset);
+}
+
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+
+ return(trx_read_trx_id(rec + offset));
+}
+
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+
+ return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
+}
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+{
+ dtuple_t* entry;
+
+ ut_ad(dtuple_check_typed(row));
+ entry = row_build_index_entry_low(row, ext, index, heap,
+ ROW_BUILD_NORMAL);
+ ut_ad(!entry || dtuple_check_typed(entry));
+ return(entry);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: secondary index record;
+ must be preserved while ref is used, as we do
+ not copy field values to heap */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint field_no;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dtuple_get_n_fields(ref);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ field_no = *(map + i);
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ field = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+ dfield_set_data(dfield, field, len);
+ }
+ }
+}
+
+/** Parse the integer data from specified data, which could be
+DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0
+and the type is not unsigned then we reset the value to 0
+@param[in] data data to read
+@param[in] len length of data
+@param[in] mtype mtype of data
+@param[in] unsigned_type if the data is unsigned
+@return the integer value from the data */
+ib_uint64_t
+row_parse_int(
+ const byte* data,
+ ulint len,
+ ulint mtype,
+ bool unsigned_type)
+{
+ ib_uint64_t value = 0;
+
+ switch (mtype) {
+ case DATA_INT:
+
+ ut_a(len <= sizeof value);
+ value = mach_read_int_type(data, len, unsigned_type);
+ break;
+
+ case DATA_FLOAT:
+
+ ut_a(len == sizeof(float));
+ value = static_cast<ib_uint64_t>(mach_float_read(data));
+ break;
+
+ case DATA_DOUBLE:
+
+ ut_a(len == sizeof(double));
+ value = static_cast<ib_uint64_t>(mach_double_read(data));
+ break;
+
+ default:
+ ut_error;
+
+ }
+
+ if (!unsigned_type && static_cast<int64_t>(value) < 0) {
+ value = 0;
+ }
+
+ return(value);
+}
+
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
new file mode 100644
index 00000000..8134c60f
--- /dev/null
+++ b/storage/innobase/include/row0sel.h
@@ -0,0 +1,457 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.h
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "data0data.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "read0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "row0mysql.h"
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node); /*!< in: select node struct */
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/** Copy used fields from cached row.
+Copy cache record field by field, don't touch fields that
+are not covered by current key.
+@param[out] buf Where to copy the MySQL row.
+@param[in] cached_rec What to copy (in MySQL row format).
+@param[in] prebuilt prebuilt struct. */
+void
+row_sel_copy_cached_fields_for_mysql(
+ byte* buf,
+ const byte* cached_rec,
+ row_prebuilt_t* prebuilt);
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field len and print a warning if
+such appears. */
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions; NOTE that dtuple->data
+ may end up pointing inside buf so
+ do not discard that buffer while
+ the tuple is being used. See
+ row_mysql_store_col_in_innobase_format()
+ in the case of DATA_INT */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len); /*!< in: MySQL key value length */
+
+
+/** Search for rows in the database using cursor.
+Function is mainly used for tables that are shared across connections and
+so it employs technique that can help re-construct the rows that
+transaction is suppose to see.
+It also has optimization such as pre-caching the rows, using AHI, etc.
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info to search_tuple,
+ index; if search tuple contains 0 field then
+ we position the cursor at start or the end of
+ index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+ Note: if this is != 0, then prebuilt must has a
+ pcur with stored position! In opening of a
+ cursor 'direction' should be 0.
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+dberr_t
+row_search_mvcc(
+ byte* buf,
+ page_cur_mode_t mode,
+ row_prebuilt_t* prebuilt,
+ ulint match_mode,
+ ulint direction)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Count rows in a R-Tree leaf level.
+@return DB_SUCCESS if successful */
+dberr_t
+row_count_rtree_recs(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint* n_rows); /*!< out: number of entries
+ seen in the consistent read */
+
+/**
+Check the index records in CHECK TABLE.
+The index must contain entries in an ascending order,
+unique constraint must not be violated by duplicated keys,
+and the number of index entries is counted in according to the
+current read view.
+
+@param prebuilt index and transaction
+@param n_rows number of records counted
+
+@return error code
+@retval DB_SUCCESS if no error was found */
+dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Read the max AUTOINC value from an index.
+@param[in] index index starting with an AUTO_INCREMENT column
+@return the largest AUTO_INCREMENT value
+@retval 0 if no records were found */
+ib_uint64_t
+row_search_max_autoinc(dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** A structure for caching column values for prefetched rows */
+struct sel_buf_t{
+ byte* data; /*!< data, or NULL; if not NULL, this field
+ has allocated memory which must be explicitly
+ freed; can be != NULL even when len is
+ UNIV_SQL_NULL */
+ ulint len; /*!< data length or UNIV_SQL_NULL */
+ ulint val_buf_size;
+ /*!< size of memory buffer allocated for data:
+ this can be more than len; this is defined
+ when data != NULL */
+};
+
+/** Copy used fields from cached row.
+Copy cache record field by field, don't touch fields that
+are not covered by current key.
+@param[out] buf Where to copy the MySQL row.
+@param[in] cached_rec What to copy (in MySQL row format).
+@param[in] prebuilt prebuilt struct. */
+void
+row_sel_copy_cached_fields_for_mysql(
+ byte* buf,
+ const byte* cached_rec,
+ row_prebuilt_t* prebuilt);
+
+/** Query plan */
+struct plan_t{
+ dict_table_t* table; /*!< table struct in the dictionary
+ cache */
+ dict_index_t* index; /*!< table index used in the search */
+ btr_pcur_t pcur; /*!< persistent cursor used to search
+ the index */
+ ibool asc; /*!< TRUE if cursor traveling upwards */
+ ibool pcur_is_open; /*!< TRUE if pcur has been positioned
+ and we can try to fetch new rows */
+ ibool cursor_at_end; /*!< TRUE if the cursor is open but
+ we know that there are no more
+ qualifying rows left to retrieve from
+ the index tree; NOTE though, that
+ there may still be unprocessed rows in
+ the prefetch stack; always FALSE when
+ pcur_is_open is FALSE */
+ ibool stored_cursor_rec_processed;
+ /*!< TRUE if the pcur position has been
+ stored and the record it is positioned
+ on has already been processed */
+ que_node_t** tuple_exps; /*!< array of expressions
+ which are used to calculate
+ the field values in the search
+ tuple: there is one expression
+ for each field in the search
+ tuple */
+ dtuple_t* tuple; /*!< search tuple */
+ page_cur_mode_t mode; /*!< search mode: PAGE_CUR_G, ... */
+ ulint n_exact_match; /*!< number of first fields in
+ the search tuple which must be
+ exactly matched */
+ ibool unique_search; /*!< TRUE if we are searching an
+ index record with a unique key */
+ ulint n_rows_fetched; /*!< number of rows fetched using pcur
+ after it was opened */
+ ulint n_rows_prefetched;/*!< number of prefetched rows cached
+ for fetch: fetching several rows in
+ the same mtr saves CPU time */
+ ulint first_prefetched;/*!< index of the first cached row in
+ select buffer arrays for each column */
+ ibool no_prefetch; /*!< no prefetch for this table */
+ sym_node_list_t columns; /*!< symbol table nodes for the columns
+ to retrieve from the table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ end_conds; /*!< conditions which determine the
+ fetch limit of the index segment we
+ have to look at: when one of these
+ fails, the result set has been
+ exhausted for the cursor in this
+ index; these conditions are normalized
+ so that in a comparison the column
+ for this table is the first argument */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ other_conds; /*!< the rest of search conditions we can
+ test at this table in a join */
+ ibool must_get_clust; /*!< TRUE if index is a non-clustered
+ index and we must also fetch the
+ clustered index record; this is the
+ case if the non-clustered record does
+ not contain all the needed columns, or
+ if this is a single-table explicit
+ cursor, or a searched update or
+ delete */
+ ulint* clust_map; /*!< map telling how clust_ref is built
+ from the fields of a non-clustered
+ record */
+ dtuple_t* clust_ref; /*!< the reference to the clustered
+ index entry is built here if index is
+ a non-clustered index */
+ btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use
+ this pcur to search the clustered
+ index */
+ mem_heap_t* old_vers_heap; /*!< memory heap used in building an old
+ version of a row, or NULL */
+};
+
+/** Select node states */
+enum sel_node_state {
+ SEL_NODE_CLOSED, /*!< it is a declared cursor which is not
+ currently open */
+ SEL_NODE_OPEN, /*!< intention locks not yet set on tables */
+ SEL_NODE_FETCH, /*!< intention locks have been set */
+ SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */
+};
+
+/** Select statement node */
+struct sel_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_SELECT */
+ enum sel_node_state
+ state; /*!< node state */
+ que_node_t* select_list; /*!< select list */
+ sym_node_t* into_list; /*!< variables list or NULL */
+ sym_node_t* table_list; /*!< table list */
+ ibool asc; /*!< TRUE if the rows should be fetched
+ in an ascending order */
+ ibool set_x_locks; /*!< TRUE if the cursor is for update or
+ delete, which means that a row x-lock
+ should be placed on the cursor row */
+ lock_mode row_lock_mode; /*!< LOCK_X or LOCK_S */
+ ulint n_tables; /*!< number of tables */
+ ulint fetch_table; /*!< number of the next table to access
+ in the join */
+ plan_t* plans; /*!< array of n_tables many plan nodes
+ containing the search plan and the
+ search data structures */
+ que_node_t* search_cond; /*!< search condition */
+ ReadView* read_view; /*!< if the query is a non-locking
+ consistent read, its read view is
+ placed here, otherwise NULL */
+ ibool consistent_read;/*!< TRUE if the select is a consistent,
+ non-locking read */
+ order_node_t* order_by; /*!< order by column definition, or
+ NULL */
+ ibool is_aggregate; /*!< TRUE if the select list consists of
+ aggregate functions */
+ ibool aggregate_already_fetched;
+ /*!< TRUE if the aggregate row has
+ already been fetched for the current
+ cursor */
+ ibool can_get_updated;/*!< this is TRUE if the select
+ is in a single-table explicit
+ cursor which can get updated
+ within the stored procedure,
+ or in a searched update or
+ delete; NOTE that to determine
+ of an explicit cursor if it
+ can get updated, the parser
+ checks from a stored procedure
+ if it contains positioned
+ update or delete statements */
+ sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */
+ UT_LIST_BASE_NODE_T(sym_node_t)
+ copy_variables; /*!< variables whose values we have to
+ copy when an explicit cursor is opened,
+ so that they do not change between
+ fetches */
+};
+
+/**
+Get the plan node for a table in a join.
+@param node query graph node for SELECT
+@param i plan node element
+@return ith plan node */
+inline plan_t *sel_node_get_nth_plan(sel_node_t *node, ulint i)
+{
+ ut_ad(i < node->n_tables);
+ return &node->plans[i];
+}
+
+/** Fetch statement node */
+struct fetch_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FETCH */
+ sel_node_t* cursor_def; /*!< cursor definition */
+ sym_node_t* into_list; /*!< variables to set */
+
+ pars_user_func_t*
+ func; /*!< User callback function or NULL.
+ The first argument to the function
+ is a sel_node_t*, containing the
+ results of the SELECT operation for
+ one row. If the function returns
+ NULL, it is not interested in
+ further rows and the cursor is
+ modified so (cursor % NOTFOUND) is
+ true. If it returns not-NULL,
+ continue normally. */
+};
+
+/** Open or close cursor operation type */
+enum open_node_op {
+ ROW_SEL_OPEN_CURSOR, /*!< open cursor */
+ ROW_SEL_CLOSE_CURSOR /*!< close cursor */
+};
+
+/** Open or close cursor statement node */
+struct open_node_t{
+ que_common_t common; /*!< type: QUE_NODE_OPEN */
+ enum open_node_op
+ op_type; /*!< operation type: open or
+ close cursor */
+ sel_node_t* cursor_def; /*!< cursor definition */
+};
+
+/** Row printf statement node */
+struct row_printf_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */
+ sel_node_t* sel_node; /*!< select */
+};
+
+/** Search direction for the MySQL interface */
+enum row_sel_direction {
+ ROW_SEL_NEXT = 1, /*!< ascending direction */
+ ROW_SEL_PREV = 2 /*!< descending direction */
+};
+
+/** Match mode for the MySQL interface */
+enum row_sel_match_mode {
+ ROW_SEL_EXACT = 1, /*!< search using a complete key value */
+ ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which
+ must match rows: the prefix may
+ contain an incomplete field (the last
+ field in prefix may be just a prefix
+ of a fixed length column) */
+};
+
+#ifdef UNIV_DEBUG
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+ row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
+#else /* UNIV_DEBUG */
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+ row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+
+void
+row_sel_field_store_in_mysql_format_func(
+/*=====================================*/
+ byte* dest, /*!< in/out: buffer where to store; NOTE
+ that BLOBs are not in themselves
+ stored here: the caller must allocate
+ and copy the BLOB into buffer before,
+ and pass the pointer to the BLOB in
+ 'data' */
+ const mysql_row_templ_t* templ,
+ /*!< in: MySQL column template.
+ Its following fields are referenced:
+ type, is_unsigned, mysql_col_len,
+ mbminlen, mbmaxlen */
+#ifdef UNIV_DEBUG
+ const dict_index_t* index,
+ /*!< in: InnoDB index */
+ ulint field_no,
+ /*!< in: templ->rec_field_no or
+ templ->clust_rec_field_no or
+ templ->icp_rec_field_no */
+#endif /* UNIV_DEBUG */
+ const byte* data, /*!< in: data to store */
+ ulint len); /*!< in: length of the data */
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
new file mode 100644
index 00000000..5e737c1c
--- /dev/null
+++ b/storage/innobase/include/row0types.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0types.h
+Row operation global types
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "buf0types.h"
+
+struct plan_t;
+
+struct upd_t;
+struct upd_field_t;
+struct upd_node_t;
+struct del_node_t;
+struct ins_node_t;
+struct sel_node_t;
+struct open_node_t;
+struct fetch_node_t;
+
+struct row_printf_node_t;
+struct sel_buf_t;
+
+struct undo_node_t;
+
+struct purge_node_t;
+
+struct row_ext_t;
+
+/** Buffer for logging modifications during online index creation */
+struct row_log_t;
+
+/* MySQL data types */
+struct TABLE;
diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h
new file mode 100644
index 00000000..a9877969
--- /dev/null
+++ b/storage/innobase/include/row0uins.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.h
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS */
+dberr_t
+row_undo_ins(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#endif
diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h
new file mode 100644
index 00000000..5032e103
--- /dev/null
+++ b/storage/innobase/include/row0umod.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.h
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+dberr_t
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+#endif
diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h
new file mode 100644
index 00000000..ae067a8a
--- /dev/null
+++ b/storage/innobase/include/row0undo.h
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.h
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return true if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+bool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node) /*!< in/out: row undo node */
+ MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution which is rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+ If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record doed not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+just in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/** Undo node structure */
+struct undo_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_UNDO */
+ bool is_temp;/*!< whether this is a temporary table */
+ trx_t* trx; /*!< trx for which undo is done */
+ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/*!< undo log record */
+ undo_no_t undo_no;/*!< undo number of the record */
+ byte rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ trx_id_t new_trx_id; /*!< trx id to restore to clustered index
+ record */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ dict_table_t* table; /*!< table where undo is done */
+ ulint cmpl_info;/*!< compiler analysis of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ const dtuple_t* ref; /*!< row reference to the next row to handle */
+ dtuple_t* row; /*!< a copy (also fields copied to heap) of the
+ row to handle */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns of the row */
+ dtuple_t* undo_row;/*!< NULL, or the row after undo */
+ row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally
+ stored columns of undo_row */
+ dict_index_t* index; /*!< the next index whose record should be
+ handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after undo is tried
+ on a row */
+};
+
+#endif
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
new file mode 100644
index 00000000..f60fc359
--- /dev/null
+++ b/storage/innobase/include/row0upd.h
@@ -0,0 +1,559 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.h
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "data0data.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "trx0types.h"
+#include "btr0pcur.h"
+#include "que0types.h"
+#include "pars0types.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap); /*!< in: heap from which memory allocated */
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update); /*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n); /*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ uint16_t field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index);
+
+/** set field number to a update vector field, marks this field is updated
+@param[in,out] upd_field update vector field
+@param[in] field_no virtual column sequence num
+@param[in] index index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+ upd_field_t* upd_field,
+ uint16_t field_no,
+ dict_index_t* index);
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ uint16_t no, /*!< in: field_no */
+ bool is_virtual) /*!< in: if it is a virtual column */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update);/*!< in: update vector */
+
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ const rec_t* rec, /*!< in: secondary index record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@param[in] index clustered index
+@param[in] entry clustered index entry to insert
+@param[in] rec clustered index record
+@param[in] offsets rec_get_offsets(rec,index), or NULL
+@param[in] no_sys skip the system columns
+ DB_TRX_ID and DB_ROLL_PTR
+@param[in] ignore_warnings ignore warnings during vcol calculation, which
+ means that this calculation is internal only
+@param[in] trx transaction (for diagnostics),
+ or NULL
+@param[in] heap memory heap from which allocated
+@param[in,out] mysql_table NULL, or mysql table object when
+ user thread invokes dml
+@param[out] error error number in case of failure
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+upd_t*
+row_upd_build_difference_binary(
+ dict_index_t* index,
+ const dtuple_t* entry,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ bool no_sys,
+ bool ignore_warnings,
+ trx_t* trx,
+ mem_heap_t* heap,
+ TABLE* mysql_table,
+ dberr_t* error)
+ MY_ATTRIBUTE((nonnull(1,2,3,8,10), warn_unused_result));
+/** Apply an update vector to an index entry.
+@param[in,out] entry index entry to be updated; the clustered index record
+ must be covered by a lock or a page latch to prevent
+ deletion (rollback or purge)
+@param[in] index index of the entry
+@param[in] update update vector built for the entry
+@param[in,out] heap memory heap for copying off-page columns */
+void
+row_upd_index_replace_new_col_vals_index_pos(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull));
+/** Replace the new column values stored in the update vector,
+during trx_undo_prev_version_build().
+@param entry clustered index tuple where the values are replaced
+ (the clustered index leaf page latch must be held)
+@param index clustered index
+@param update update vector for the clustered index
+@param heap memory heap for allocating and copying values
+@return whether the previous version was built successfully */
+bool
+row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
+ const upd_t *update, mem_heap_t *heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap); /*!< in: memory heap */
+/** Replaces the virtual column values stored in a dtuple with that of
+a update vector.
+@param[in,out] row dtuple whose column to be updated
+@param[in] table table
+@param[in] update an update vector built for the clustered index
+@param[in] upd_new update to new or old value
+@param[in,out] undo_row undo row (if needs to be updated)
+@param[in] ptr remaining part in update undo log */
+void
+row_upd_replace_vcol(
+ dtuple_t* row,
+ const dict_table_t* table,
+ const upd_t* update,
+ bool upd_new,
+ dtuple_t* undo_row,
+ const byte* ptr);
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+ibool
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ const row_ext_t*ext, /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */
+ MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+#ifdef UNIV_DEBUG
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,thr,row,ext,0)
+#else /* UNIV_DEBUG */
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,row,ext,0)
+#endif /* UNIV_DEBUG */
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if FTS indexed column updated else
+ULINT_UNDEFINED */
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field); /*!< in: field to check */
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether Doc ID column is affected */
+bool
+row_upd_changes_doc_id(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update);/*!< in: update vector for the row */
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* Update vector field */
+struct upd_field_t{
+ uint16_t field_no; /*!< field number in an index, usually
+ the clustered index, but in updating
+ a secondary index record in btr0cur.cc
+ this is the position in the secondary
+ index. If this field is a virtual
+ column, then field_no represents
+ the nth virtual column in the table */
+ uint16_t orig_len; /*!< original length of the locally
+ stored part of an externally stored
+ column, or 0 */
+ que_node_t* exp; /*!< expression for calculating a new
+ value: it refers to column values and
+ constants in the symbol table of the
+ query graph */
+ dfield_t new_val; /*!< new value for the column */
+ dfield_t* old_v_val; /*!< old value for the virtual column */
+};
+
+
+/* check whether an update field is on virtual column */
+#define upd_fld_is_virtual_col(upd_fld) \
+ (((upd_fld)->new_val.type.prtype & DATA_VIRTUAL) == DATA_VIRTUAL)
+
+/* set DATA_VIRTUAL bit on update field to show it is a virtual column */
+#define upd_fld_set_virtual_col(upd_fld) \
+ ((upd_fld)->new_val.type.prtype |= DATA_VIRTUAL)
+
+/* Update vector structure */
+struct upd_t{
+ mem_heap_t* heap; /*!< heap from which memory allocated */
+ byte info_bits; /*!< new value of info bits to record;
+ default is 0 */
+ dtuple_t* old_vrow; /*!< pointer to old row, used for
+ virtual column update now */
+ ulint n_fields; /*!< number of update fields */
+ upd_field_t* fields; /*!< array of update fields */
+ byte vers_sys_value[8]; /*!< buffer for updating system fields */
+
+ /** Append an update field to the end of array
+ @param[in] field an update field */
+ void append(const upd_field_t& field)
+ {
+ fields[n_fields++] = field;
+ }
+
+ void remove_element(ulint i)
+ {
+ ut_ad(n_fields > 0);
+ ut_ad(i < n_fields);
+ while (i < n_fields - 1)
+ {
+ fields[i]= fields[i + 1];
+ i++;
+ }
+ n_fields--;
+ }
+
+ bool remove(const ulint field_no)
+ {
+ for (ulint i= 0; i < n_fields; ++i)
+ {
+ if (field_no == fields[i].field_no)
+ {
+ remove_element(i);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /** Determine if the given field_no is modified.
+ @return true if modified, false otherwise. */
+ bool is_modified(uint16_t field_no) const
+ {
+ for (ulint i = 0; i < n_fields; ++i) {
+ if (field_no == fields[i].field_no) {
+ return(true);
+ }
+ }
+ return(false);
+ }
+
+ /** Determine if the update affects a system versioned column or row_end. */
+ bool affects_versioned() const
+ {
+ for (ulint i = 0; i < n_fields; i++) {
+ dtype_t type = fields[i].new_val.type;
+ if (type.is_versioned()) {
+ return true;
+ }
+ // versioned DELETE is UPDATE SET row_end=NOW
+ if (type.vers_sys_end()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /** @return whether this is for a hidden metadata record
+ for instant ALTER TABLE */
+ bool is_metadata() const { return dtuple_t::is_metadata(info_bits); }
+ /** @return whether this is for a hidden metadata record
+ for instant ALTER TABLE (not only ADD COLUMN) */
+ bool is_alter_metadata() const
+ { return dtuple_t::is_alter_metadata(info_bits); }
+
+#ifdef UNIV_DEBUG
+ bool validate() const
+ {
+ for (ulint i = 0; i < n_fields; ++i) {
+ dfield_t* field = &fields[i].new_val;
+ if (dfield_is_ext(field)) {
+ ut_ad(dfield_get_len(field)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ return(true);
+ }
+#endif // UNIV_DEBUG
+};
+
+/** Kinds of update operation */
+enum delete_mode_t {
+ NO_DELETE = 0, /*!< this operation does not delete */
+ PLAIN_DELETE, /*!< ordinary delete */
+ VERSIONED_DELETE /*!< update old and insert a new row */
+};
+
+/* Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_UPDATE */
+ delete_mode_t is_delete; /*!< kind of DELETE */
+ ibool searched_update;
+ /* TRUE if searched update, FALSE if
+ positioned */
+ bool in_mysql_interface;
+ /* whether the update node was created
+ for the MySQL interface */
+ dict_foreign_t* foreign;/* NULL or pointer to a foreign key
+ constraint if this update node is used in
+ doing an ON DELETE or ON UPDATE operation */
+ upd_node_t* cascade_node;/* NULL or an update node template which
+ is used to implement ON DELETE/UPDATE CASCADE
+ or ... SET NULL for foreign keys */
+ mem_heap_t* cascade_heap;
+ /*!< NULL or a mem heap where cascade
+ node is created.*/
+ sel_node_t* select; /*!< query graph subtree implementing a base
+ table cursor: the rows returned will be
+ updated */
+ btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered
+ index record which should be updated or
+ deleted; the cursor is stored in the graph
+ of 'select' field above, except in the case
+ of the MySQL interface */
+ dict_table_t* table; /*!< table where updated */
+ upd_t* update; /*!< update vector for the row */
+ ulint update_n_fields;
+ /* when this struct is used to implement
+ a cascade operation for foreign keys, we store
+ here the size of the buffer allocated for use
+ as the update vector */
+ sym_node_list_t columns;/* symbol table nodes for the columns
+ to retrieve from the table */
+ ibool has_clust_rec_x_lock;
+ /* TRUE if the select which retrieves the
+ records to update already sets an x-lock on
+ the clustered record; note that it must always
+ set at least an s-lock */
+ ulint cmpl_info;/* information extracted during query
+ compilation; speeds up execution:
+ UPD_NODE_NO_ORD_CHANGE and
+ UPD_NODE_NO_SIZE_CHANGE, ORed */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be updated */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the row to update; this must be reset
+ to NULL after a successful update */
+ dtuple_t* historical_row; /*!< historical row used in
+ CASCADE UPDATE/SET NULL;
+ allocated from historical_heap */
+ mem_heap_t* historical_heap; /*!< heap for historical row insertion;
+ created when row to update is located;
+ freed right before row update */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ dtuple_t* upd_row;/* NULL, or a copy of the updated row */
+ row_ext_t* upd_ext;/* NULL, or prefixes of the externally
+ stored columns in upd_row */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage;
+ this must be emptied after a successful
+ update */
+ /*----------------------*/
+ sym_node_t* table_sym;/* table node in symbol table */
+ que_node_t* col_assign_list;
+ /* column assignment list */
+ ulint magic_n;
+
+private:
+ /** Appends row_start or row_end field to update vector and sets a
+ CURRENT_TIMESTAMP/trx->id value to it.
+ Supposed to be called only by make_versioned_update() and
+ make_versioned_delete().
+ @param[in] trx transaction
+ @param[in] vers_sys_idx table->row_start or table->row_end */
+ void vers_update_fields(const trx_t *trx, ulint idx);
+
+public:
+ /** Also set row_start = CURRENT_TIMESTAMP/trx->id
+ @param[in] trx transaction */
+ void vers_make_update(const trx_t *trx)
+ {
+ vers_update_fields(trx, table->vers_start);
+ }
+
+ /** Prepare update vector for versioned delete.
+ Set row_end to CURRENT_TIMESTAMP or trx->id.
+ Initialize fts_next_doc_id for versioned delete.
+ @param[in] trx transaction */
+ void vers_make_delete(trx_t *trx);
+};
+
+#define UPD_NODE_MAGIC_N 1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from
+ a node above and if the field
+ has_clust_rec_x_lock is FALSE, we
+ should set an intention x-lock on
+ the table */
+#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be
+ updated */
+#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be
+ inserted, old record is already delete
+ marked */
+#define UPD_NODE_UPDATE_ALL_SEC 5 /* an ordering field of the clustered
+ index record was changed, or this is
+ a delete operation: should update
+ all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC 6 /* secondary index entries should be
+ looked at and updated if an ordering
+ field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
+ changed in the update and no ordering
+ field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be
+ changed in the update */
+
+
+#include "row0upd.inl"
+
+#endif
diff --git a/storage/innobase/include/row0upd.inl b/storage/innobase/include/row0upd.inl
new file mode 100644
index 00000000..13aacf3f
--- /dev/null
+++ b/storage/innobase/include/row0upd.inl
@@ -0,0 +1,153 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.ic
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "row0row.h"
+#include "lock0lock.h"
+#include "page0zip.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap) /*!< in: heap from which memory allocated */
+{
+ upd_t* update;
+
+ update = static_cast<upd_t*>(mem_heap_zalloc(
+ heap, sizeof(upd_t) + sizeof(upd_field_t) * n));
+
+ update->n_fields = n;
+ update->fields = reinterpret_cast<upd_field_t*>(&update[1]);
+ update->heap = heap;
+
+ return(update);
+}
+
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update) /*!< in: update vector */
+{
+ ut_ad(update);
+
+ return(update->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n) /*!< in: field position in update vector */
+{
+ ut_ad(update);
+ ut_ad(n < update->n_fields);
+
+ return((upd_field_t*) update->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ uint16_t field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index) /*!< in: index */
+{
+ upd_field->field_no = field_no;
+ upd_field->orig_len = 0;
+ dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+ dfield_get_type(&upd_field->new_val));
+}
+
+/** set field number to a update vector field, marks this field is updated.
+@param[in,out] upd_field update vector field
+@param[in] field_no virtual column sequence num
+@param[in] index index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+ upd_field_t* upd_field,
+ uint16_t field_no,
+ dict_index_t* index)
+{
+ ut_a(field_no < dict_table_get_n_v_cols(index->table));
+ upd_field->field_no = field_no;
+ upd_field->orig_len = 0;
+
+ dict_col_copy_type(&dict_table_get_nth_v_col(
+ index->table, field_no)->m_col,
+ dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ uint16_t no, /*!< in: field_no */
+ bool is_virtual) /*!< in: if it is virtual column */
+{
+ ulint i;
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+
+ /* matches only if the field matches that of is_virtual */
+ if ((!is_virtual) != (!upd_fld_is_virtual_col(uf))) {
+ continue;
+ }
+
+ if (uf->field_no == no) {
+
+ return(uf);
+ }
+ }
+
+ return(NULL);
+}
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
new file mode 100644
index 00000000..60f310e1
--- /dev/null
+++ b/storage/innobase/include/row0vers.h
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "row0types.h"
+
+// Forward declaration
+class ReadView;
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out] caller_trx trx of current thread
+@param[in] rec secondary index record
+@param[in] index secondary index
+@param[in] offsets rec_get_offsets(rec, index)
+@return the active transaction; state must be rechecked after
+acquiring trx->mutex, and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+trx_t*
+row_vers_impl_x_locked(
+ trx_t* caller_trx,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets);
+
+/** Finds out if a version of the record, where the version >= the current
+purge_sys.view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@param[in] also_curr TRUE if also rec is included in the versions
+ to search; otherwise only versions prior
+ to it are searched
+@param[in] rec record in the clustered index; the caller
+ must have a latch on the page
+@param[in] mtr mtr holding the latch on rec; it will
+ also hold the latch on purge_view
+@param[in] index secondary index
+@param[in] ientry secondary index entry
+@param[in] roll_ptr roll_ptr for the purge record
+@param[in] trx_id transaction ID on the purging record
+@return TRUE if earlier version should have */
+bool
+row_vers_old_has_index_entry(
+ bool also_curr,
+ const rec_t* rec,
+ mtr_t* mtr,
+ dict_index_t* index,
+ const dtuple_t* ientry,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id);
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this records */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ ReadView* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers,/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow); /*!< out: reports virtual column info if any */
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ trx_t* caller_trx,/*!<in/out: trx of current thread */
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this records */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers,/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow); /*!< out: holds virtual column info if any
+ is updated in the view */
+
+#endif
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
new file mode 100644
index 00000000..4881f2f1
--- /dev/null
+++ b/storage/innobase/include/rw_lock.h
@@ -0,0 +1,138 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <atomic>
+#include "my_dbug.h"
+
+/** Simple read-write lock based on std::atomic */
+class rw_lock
+{
+ /** The lock word */
+ std::atomic<uint32_t> lock;
+
+protected:
+ /** Available lock */
+ static constexpr uint32_t UNLOCKED= 0;
+ /** Flag to indicate that write_lock() is being held */
+ static constexpr uint32_t WRITER= 1U << 31;
+ /** Flag to indicate that write_lock_wait() is pending */
+ static constexpr uint32_t WRITER_WAITING= 1U << 30;
+ /** Flag to indicate that write_lock() or write_lock_wait() is pending */
+ static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
+
+ /** Start waiting for an exclusive lock. */
+ void write_lock_wait_start()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+ __asm__ __volatile__("lock btsl $30, %0" : "+m" (lock));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+ _interlockedbittestandset(reinterpret_cast<volatile long*>(&lock), 30);
+#else
+ lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
+#endif
+ }
+ /** Start waiting for an exclusive lock.
+ @return current value of the lock word */
+ uint32_t write_lock_wait_start_read()
+ { return lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+ /** Wait for an exclusive lock.
+ @param l the value of the lock word
+ @return whether the exclusive lock was acquired */
+ bool write_lock_wait_try(uint32_t &l)
+ {
+ return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+ /** Try to acquire a shared lock.
+ @param l the value of the lock word
+ @return whether the lock was acquired */
+ bool read_trylock(uint32_t &l)
+ {
+ l= UNLOCKED;
+ while (!lock.compare_exchange_strong(l, l + 1, std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ DBUG_ASSERT(!(WRITER & l) || !(~WRITER_PENDING & l));
+ if (l & WRITER_PENDING)
+ return false;
+ }
+ return true;
+ }
+
+ /** Wait for an exclusive lock.
+ @return whether the exclusive lock was acquired */
+ bool write_lock_poll()
+ {
+ auto l= WRITER_WAITING;
+ if (write_lock_wait_try(l))
+ return true;
+ if (!(l & WRITER_WAITING))
+ /* write_lock() must have succeeded for another thread */
+ write_lock_wait_start();
+ return false;
+ }
+ /** @return the lock word value */
+ uint32_t value() const { return lock.load(std::memory_order_acquire); }
+
+public:
+ /** Default constructor */
+ rw_lock() : lock(UNLOCKED) {}
+
+ /** Release a shared lock.
+ @return whether any writers may have to be woken up */
+ bool read_unlock()
+ {
+ auto l= lock.fetch_sub(1, std::memory_order_release);
+ DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
+ DBUG_ASSERT(~(WRITER_PENDING) & l); /* at least one read lock */
+ return (~WRITER_PENDING & l) == 1;
+ }
+ /** Release an exclusive lock */
+ void write_unlock()
+ {
+ /* Below, we use fetch_sub(WRITER) instead of fetch_and(~WRITER).
+ The reason is that on IA-32 and AMD64 it translates into the 80486
+ instruction LOCK XADD, while fetch_and() translates into a loop
+ around LOCK CMPXCHG. For other ISA either form should be fine. */
+ static_assert(WRITER == 1U << 31, "compatibility");
+ IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
+ /* the write lock must have existed */
+ DBUG_ASSERT(l & WRITER);
+ }
+ /** Try to acquire a shared lock.
+ @return whether the lock was acquired */
+ bool read_trylock() { uint32_t l; return read_trylock(l); }
+ /** Try to acquire an exclusive lock.
+ @return whether the lock was acquired */
+ bool write_trylock()
+ {
+ auto l= UNLOCKED;
+ return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ /** @return whether an exclusive lock is being held by any thread */
+ bool is_write_locked() const { return !!(value() & WRITER); }
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const { return value() != 0; }
+ /** @return whether any lock is being held by any thread */
+ bool is_locked() const { return (value() & ~WRITER_WAITING) != 0; }
+};
diff --git a/storage/innobase/include/small_vector.h b/storage/innobase/include/small_vector.h
new file mode 100644
index 00000000..d28a3618
--- /dev/null
+++ b/storage/innobase/include/small_vector.h
@@ -0,0 +1,100 @@
+/*****************************************************************************
+
+Copyright (c) 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+/* A normally small vector, inspired by llvm::SmallVector */
+#include "my_global.h"
+#include <iterator>
+#include <memory>
+
+class small_vector_base
+{
+protected:
+ typedef uint32_t Size_T;
+ void *BeginX;
+ Size_T Size= 0, Capacity;
+ small_vector_base()= delete;
+ small_vector_base(void *small, size_t small_size)
+ : BeginX(small), Capacity(Size_T(small_size)) {}
+ ATTRIBUTE_COLD void grow_by_1(void *small, size_t element_size);
+public:
+ size_t size() const { return Size; }
+ size_t capacity() const { return Capacity; }
+ bool empty() const { return !Size; }
+ void clear() { Size= 0; }
+protected:
+ void set_size(size_t N) { Size= Size_T(N); }
+};
+
+template <typename T, unsigned N>
+class small_vector : public small_vector_base
+{
+ /** The fixed storage allocation */
+ T small[N];
+
+ using small_vector_base::set_size;
+
+ void grow_if_needed()
+ {
+ if (unlikely(size() >= capacity()))
+ grow_by_1(small, sizeof *small);
+ }
+
+public:
+ small_vector() : small_vector_base(small, N)
+ {
+ TRASH_ALLOC(small, sizeof small);
+ }
+ ~small_vector()
+ {
+ if (small != begin())
+ my_free(begin());
+ MEM_MAKE_ADDRESSABLE(small, sizeof small);
+ }
+
+ using iterator= T *;
+ using const_iterator= const T *;
+ using reverse_iterator= std::reverse_iterator<iterator>;
+ using reference= T &;
+ using const_reference= const T&;
+
+ iterator begin() { return static_cast<iterator>(BeginX); }
+ const_iterator begin() const { return static_cast<const_iterator>(BeginX); }
+ iterator end() { return begin() + size(); }
+ const_iterator end() const { return begin() + size(); }
+
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+
+ reference operator[](size_t i) { assert(i < size()); return begin()[i]; }
+ const_reference operator[](size_t i) const
+ { return const_cast<small_vector&>(*this)[i]; }
+
+ void erase(const_iterator S, const_iterator E)
+ {
+ set_size(std::move(const_cast<iterator>(E), end(),
+ const_cast<iterator>(S)) - begin());
+ }
+
+ void emplace_back(T &&arg)
+ {
+ grow_if_needed();
+ ::new (end()) T(arg);
+ set_size(size() + 1);
+ }
+};
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
new file mode 100644
index 00000000..51f3049b
--- /dev/null
+++ b/storage/innobase/include/srv0mon.h
@@ -0,0 +1,846 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+
+#ifndef __STDC_LIMIT_MACROS
+/* Required for FreeBSD so that INT64_MAX is defined. */
+#define __STDC_LIMIT_MACROS
+#endif /* __STDC_LIMIT_MACROS */
+
+#include <cstdint>
+#include "my_atomic.h"
+#include "my_atomic_wrapper.h"
+
+/** Possible status values for "mon_status" in "struct monitor_value" */
+enum monitor_running_status {
+ MONITOR_STARTED = 1, /*!< Monitor has been turned on */
+ MONITOR_STOPPED = 2 /*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status monitor_running_t;
+
+/** Monitor counter value type */
+typedef int64_t mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t" which contains dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is also defined,
+it identifies each monitor with an internally used symbol, whose
+integer value indexes into above two structure for its dynamic
+and static information.
+Developer who intend to add new counters would require to
+fill in counter information as described in "monitor_info_t" and
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+ time_t mon_start_time; /*!< Start time of monitoring */
+ time_t mon_stop_time; /*!< Stop time of monitoring */
+ time_t mon_reset_time; /*!< Time of resetting the counter */
+ mon_type_t mon_value; /*!< Current counter Value */
+ mon_type_t mon_max_value; /*!< Current Max value */
+ mon_type_t mon_min_value; /*!< Current Min value */
+ mon_type_t mon_value_reset;/*!< value at last reset */
+ mon_type_t mon_max_value_start; /*!< Max value since start */
+ mon_type_t mon_min_value_start; /*!< Min value since start */
+ mon_type_t mon_start_value;/*!< Value at the start time */
+ mon_type_t mon_last_value; /*!< Last set of values */
+ monitor_running_t mon_status; /* whether monitor still running */
+};
+
+/** Follwoing defines are possible values for "monitor_type" field in
+"struct monitor_info" */
+enum monitor_type_t {
+ MONITOR_NONE = 0, /*!< No monitoring */
+ MONITOR_MODULE = 1, /*!< This is a monitor module type,
+ not a counter */
+ MONITOR_EXISTING = 2, /*!< The monitor carries information from
+ an existing system status variable */
+ MONITOR_NO_AVERAGE = 4, /*!< Set this status if we don't want to
+ calculate the average value for the counter */
+ MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+ counter, rather than incremental value
+ over the period. Mostly for counters
+ displaying current resource usage */
+ MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+ only as a module, but not individually */
+ MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+ server start up */
+ MONITOR_SET_OWNER = 64, /*!< Owner of "monitor set", a set of
+ monitor counters */
+ MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+ MONITOR_HIDDEN = 256 /*!< Do not display this monitor in the
+ metrics table */
+};
+
+/** Counter minimum value is initialized to be max value of
+ mon_type_t (int64_t) */
+#ifndef INT64_MAX
+#define INT64_MAX (9223372036854775807LL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN (-9223372036854775807LL-1)
+#endif
+#define MIN_RESERVED INT64_MAX
+#define MAX_RESERVED INT64_MIN
+
+/** This enumeration defines internal monitor identifier used internally
+to identify each particular counter. Its value indexes into two arrays,
+one is the "innodb_counter_value" array which records actual monitor
+counter values, the other is "innodb_counter_info" array which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules here:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses exisitng counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detail
+information for each monitor counter */
+
+enum monitor_id_t {
+ /* This is to identify the default value set by the metrics
+ control global variables */
+ MONITOR_DEFAULT_START = 0,
+
+ /* Start of Metadata counter */
+ MONITOR_MODULE_METADATA,
+ MONITOR_TABLE_OPEN,
+
+ /* Lock manager related counters */
+ MONITOR_MODULE_LOCK,
+ MONITOR_DEADLOCK,
+ MONITOR_TIMEOUT,
+ MONITOR_LOCKREC_WAIT,
+ MONITOR_TABLELOCK_WAIT,
+ MONITOR_NUM_RECLOCK_REQ,
+ MONITOR_RECLOCK_CREATED,
+ MONITOR_RECLOCK_REMOVED,
+ MONITOR_NUM_RECLOCK,
+ MONITOR_TABLELOCK_CREATED,
+ MONITOR_TABLELOCK_REMOVED,
+ MONITOR_NUM_TABLELOCK,
+ MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+ MONITOR_OVLD_LOCK_WAIT_TIME,
+ MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+ MONITOR_OVLD_ROW_LOCK_WAIT,
+ MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+ /* Buffer and I/O realted counters. */
+ MONITOR_MODULE_BUFFER,
+ MONITOR_OVLD_BUFFER_POOL_SIZE,
+ MONITOR_OVLD_BUF_POOL_READS,
+ MONITOR_OVLD_BUF_POOL_READ_REQUESTS,
+ MONITOR_OVLD_BUF_POOL_WRITE_REQUEST,
+ MONITOR_OVLD_BUF_POOL_WAIT_FREE,
+ MONITOR_OVLD_BUF_POOL_READ_AHEAD,
+ MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED,
+ MONITOR_OVLD_BUF_POOL_PAGE_TOTAL,
+ MONITOR_OVLD_BUF_POOL_PAGE_MISC,
+ MONITOR_OVLD_BUF_POOL_PAGES_DATA,
+ MONITOR_OVLD_BUF_POOL_BYTES_DATA,
+ MONITOR_OVLD_BUF_POOL_PAGES_DIRTY,
+ MONITOR_OVLD_BUF_POOL_BYTES_DIRTY,
+ MONITOR_OVLD_BUF_POOL_PAGES_FREE,
+ MONITOR_OVLD_PAGE_CREATED,
+ MONITOR_OVLD_PAGES_WRITTEN,
+ MONITOR_OVLD_PAGES_READ,
+ MONITOR_OVLD_BYTE_READ,
+ MONITOR_OVLD_BYTE_WRITTEN,
+ MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES,
+ MONITOR_FLUSH_N_TO_FLUSH_REQUESTED,
+
+ MONITOR_FLUSH_N_TO_FLUSH_BY_AGE,
+ MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
+
+ MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
+
+ MONITOR_LRU_GET_FREE_LOOPS,
+ MONITOR_LRU_GET_FREE_WAITS,
+
+ MONITOR_FLUSH_AVG_PAGE_RATE,
+ MONITOR_FLUSH_LSN_AVG_RATE,
+ MONITOR_FLUSH_PCT_FOR_DIRTY,
+ MONITOR_FLUSH_PCT_FOR_LSN,
+ MONITOR_FLUSH_SYNC_WAITS,
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES,
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
+ MONITOR_LRU_GET_FREE_SEARCH,
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+
+ /* Buffer Page I/O specific counters. */
+ MONITOR_MODULE_BUF_PAGE,
+ MONITOR_INDEX_LEAF_PAGE_READ,
+ MONITOR_INDEX_NON_LEAF_PAGE_READ,
+ MONITOR_INDEX_IBUF_LEAF_PAGE_READ,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ,
+ MONITOR_UNDO_LOG_PAGE_READ,
+ MONITOR_INODE_PAGE_READ,
+ MONITOR_IBUF_FREELIST_PAGE_READ,
+ MONITOR_IBUF_BITMAP_PAGE_READ,
+ MONITOR_SYSTEM_PAGE_READ,
+ MONITOR_TRX_SYSTEM_PAGE_READ,
+ MONITOR_FSP_HDR_PAGE_READ,
+ MONITOR_XDES_PAGE_READ,
+ MONITOR_BLOB_PAGE_READ,
+ MONITOR_ZBLOB_PAGE_READ,
+ MONITOR_ZBLOB2_PAGE_READ,
+ MONITOR_OTHER_PAGE_READ,
+ MONITOR_INDEX_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN,
+ MONITOR_UNDO_LOG_PAGE_WRITTEN,
+ MONITOR_INODE_PAGE_WRITTEN,
+ MONITOR_IBUF_FREELIST_PAGE_WRITTEN,
+ MONITOR_IBUF_BITMAP_PAGE_WRITTEN,
+ MONITOR_SYSTEM_PAGE_WRITTEN,
+ MONITOR_TRX_SYSTEM_PAGE_WRITTEN,
+ MONITOR_FSP_HDR_PAGE_WRITTEN,
+ MONITOR_XDES_PAGE_WRITTEN,
+ MONITOR_BLOB_PAGE_WRITTEN,
+ MONITOR_ZBLOB_PAGE_WRITTEN,
+ MONITOR_ZBLOB2_PAGE_WRITTEN,
+ MONITOR_OTHER_PAGE_WRITTEN,
+
+ /* OS level counters (I/O) */
+ MONITOR_MODULE_OS,
+ MONITOR_OVLD_OS_FILE_READ,
+ MONITOR_OVLD_OS_FILE_WRITE,
+ MONITOR_OVLD_OS_FSYNC,
+ MONITOR_OS_PENDING_READS,
+ MONITOR_OS_PENDING_WRITES,
+ MONITOR_OVLD_OS_LOG_WRITTEN,
+
+ /* Transaction related counters */
+ MONITOR_MODULE_TRX,
+ MONITOR_TRX_RW_COMMIT,
+ MONITOR_TRX_RO_COMMIT,
+ MONITOR_TRX_NL_RO_COMMIT,
+ MONITOR_TRX_COMMIT_UNDO,
+ MONITOR_TRX_ROLLBACK,
+ MONITOR_TRX_ROLLBACK_SAVEPOINT,
+ MONITOR_RSEG_HISTORY_LEN,
+ MONITOR_NUM_UNDO_SLOT_USED,
+ MONITOR_NUM_UNDO_SLOT_CACHED,
+ MONITOR_RSEG_CUR_SIZE,
+
+ /* Purge related counters */
+ MONITOR_MODULE_PURGE,
+ MONITOR_N_DEL_ROW_PURGE,
+ MONITOR_N_UPD_EXIST_EXTERN,
+ MONITOR_PURGE_INVOKED,
+ MONITOR_PURGE_N_PAGE_HANDLED,
+ MONITOR_DML_PURGE_DELAY,
+ MONITOR_PURGE_STOP_COUNT,
+ MONITOR_PURGE_RESUME_COUNT,
+
+ /* Recovery related counters */
+ MONITOR_MODULE_RECOVERY,
+ MONITOR_OVLD_CHECKPOINTS,
+ MONITOR_OVLD_LSN_FLUSHDISK,
+ MONITOR_OVLD_LSN_CHECKPOINT,
+ MONITOR_OVLD_LSN_CURRENT,
+ MONITOR_LSN_CHECKPOINT_AGE,
+ MONITOR_OVLD_BUF_OLDEST_LSN,
+ MONITOR_OVLD_MAX_AGE_ASYNC,
+ MONITOR_OVLD_LOG_WAITS,
+ MONITOR_OVLD_LOG_WRITE_REQUEST,
+ MONITOR_OVLD_LOG_WRITES,
+
+ /* Page Manager related counters */
+ MONITOR_MODULE_PAGE,
+ MONITOR_PAGE_COMPRESS,
+ MONITOR_PAGE_DECOMPRESS,
+ MONITOR_PAD_INCREMENTS,
+ MONITOR_PAD_DECREMENTS,
+ /* New monitor variables for page compression */
+ MONITOR_OVLD_PAGE_COMPRESS_SAVED,
+ MONITOR_OVLD_PAGES_PAGE_COMPRESSED,
+ MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP,
+ MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED,
+ MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR,
+
+ /* New monitor variables for page encryption */
+ MONITOR_OVLD_PAGES_ENCRYPTED,
+ MONITOR_OVLD_PAGES_DECRYPTED,
+
+ /* Index related counters */
+ MONITOR_MODULE_INDEX,
+ MONITOR_INDEX_SPLIT,
+ MONITOR_INDEX_MERGE_ATTEMPTS,
+ MONITOR_INDEX_MERGE_SUCCESSFUL,
+ MONITOR_INDEX_REORG_ATTEMPTS,
+ MONITOR_INDEX_REORG_SUCCESSFUL,
+ MONITOR_INDEX_DISCARD,
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* Adaptive Hash Index related counters */
+ MONITOR_MODULE_ADAPTIVE_HASH,
+ MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
+ MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
+ MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
+ MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
+ MONITOR_ADAPTIVE_HASH_ROW_ADDED,
+ MONITOR_ADAPTIVE_HASH_ROW_REMOVED,
+ MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND,
+ MONITOR_ADAPTIVE_HASH_ROW_UPDATED,
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* Tablespace related counters */
+ MONITOR_MODULE_FIL_SYSTEM,
+ MONITOR_OVLD_N_FILE_OPENED,
+
+ /* InnoDB Change Buffer related counters */
+ MONITOR_MODULE_IBUF_SYSTEM,
+ MONITOR_OVLD_IBUF_MERGE_INSERT,
+ MONITOR_OVLD_IBUF_MERGE_DELETE,
+ MONITOR_OVLD_IBUF_MERGE_PURGE,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+ MONITOR_OVLD_IBUF_MERGES,
+ MONITOR_OVLD_IBUF_SIZE,
+
+ /* Counters for server operations */
+ MONITOR_MODULE_SERVER,
+ MONITOR_MASTER_THREAD_SLEEP,
+ MONITOR_OVLD_SERVER_ACTIVITY,
+ MONITOR_MASTER_ACTIVE_LOOPS,
+ MONITOR_MASTER_IDLE_LOOPS,
+ MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+ MONITOR_SRV_DICT_LRU_MICROSECOND,
+ MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
+ MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE,
+ MONITOR_OVLD_SRV_DBLWR_WRITES,
+ MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+ MONITOR_OVLD_SRV_PAGE_SIZE,
+
+ /* Data DDL related counters */
+ MONITOR_MODULE_DDL_STATS,
+ MONITOR_BACKGROUND_DROP_INDEX,
+ MONITOR_ONLINE_CREATE_INDEX,
+ MONITOR_PENDING_ALTER_TABLE,
+ MONITOR_ALTER_TABLE_SORT_FILES,
+ MONITOR_ALTER_TABLE_LOG_FILES,
+
+ MONITOR_MODULE_ICP,
+ MONITOR_ICP_ATTEMPTS,
+ MONITOR_ICP_NO_MATCH,
+ MONITOR_ICP_OUT_OF_RANGE,
+ MONITOR_ICP_MATCH,
+
+ /* This is used only for control system to turn
+ on/off and reset all monitor counters */
+ MONITOR_ALL_COUNTER,
+
+ /* This must be the last member */
+ NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wild card match */
+#define MONITOR_WILDCARD_MATCH (NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define MONITOR_NO_MATCH (NUM_MONITOR + 2)
+
+/** struct monitor_info describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+ const char* monitor_name; /*!< Monitor name */
+ const char* monitor_module; /*!< Sub Module the monitor
+ belongs to */
+ const char* monitor_desc; /*!< Brief desc of monitor counter */
+ monitor_type_t monitor_type; /*!< Type of Monitor Info */
+ monitor_id_t monitor_related_id;/*!< Monitor ID of counter that
+ related to this monitor. This is
+ set when the monitor belongs to
+ a "monitor set" */
+ monitor_id_t monitor_id; /*!< Monitor ID as defined in enum
+ monitor_id_t */
+};
+
+/** Following are the "set_option" values allowed for
+srv_mon_process_existing_counter() and srv_mon_process_existing_counter()
+functions. To turn on/off/reset the monitor counters. */
+enum mon_option_t {
+ MONITOR_TURN_ON = 1, /*!< Turn on the counter */
+ MONITOR_TURN_OFF, /*!< Turn off the counter */
+ MONITOR_RESET_VALUE, /*!< Reset current values */
+ MONITOR_RESET_ALL_VALUE, /*!< Reset all values */
+ MONITOR_GET_VALUE /*!< Option for
+ srv_mon_process_existing_counter()
+ function */
+};
+
+/** Number of bit in a ulint datatype */
+#define NUM_BITS_ULINT (sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap records whether a particular monitor
+counter has been turned on or off */
+extern Atomic_relaxed<ulint>
+ monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option. */
+#define MONITOR_ON(monitor) \
+ (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_or( \
+ (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+#define MONITOR_OFF(monitor) \
+ (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_and( \
+ ~(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor) \
+ (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] & \
+ (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monintor counter
+value */
+extern monitor_value_t innodb_counter_value[NUM_MONITOR];
+
+/** Following are macro defines for basic montior counter manipulations.
+Please note we do not provide any synchronization for these monitor
+operations due to performance consideration. Most counters can
+be placed under existing mutex protections in respective code
+module. */
+
+/** Macros to access various fields of a monitor counters */
+#define MONITOR_FIELD(monitor, field) \
+ (innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor) \
+ MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor) \
+ MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor) \
+ MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor) \
+ (MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor) \
+ MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor) \
+ do { \
+ MONITOR_STATUS(monitor) = MONITOR_STARTED; \
+ MONITOR_FIELD((monitor), mon_start_time) = time(NULL); \
+ } while (0)
+
+#define MONITOR_SET_OFF(monitor) \
+ do { \
+ MONITOR_STATUS(monitor) = MONITOR_STOPPED; \
+ MONITOR_FIELD((monitor), mon_stop_time) = time(NULL); \
+ } while (0)
+
+#define MONITOR_INIT_ZERO_VALUE 0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor) \
+ (MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE \
+ && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE \
+ && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor) \
+ if (MONITOR_MAX_MIN_NOT_INIT(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \
+ MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \
+ MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \
+ }
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define MONITOR_INC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor)++; \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor monitor to be incremented by 1
+@param enabled whether the monitor is enabled */
+#define MONITOR_ATOMIC_INC_LOW(monitor, enabled) \
+ if (enabled) { \
+ ib_uint64_t value; \
+ value = my_atomic_add64_explicit( \
+ (int64*) &MONITOR_VALUE(monitor), 1, \
+ MY_MEMORY_ORDER_RELAXED) + 1; \
+ /* Note: This is not 100% accurate because of the \
+ inherent race, we ignore it due to performance. */ \
+ if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = value; \
+ } \
+ }
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor monitor to be decremented by 1
+@param enabled whether the monitor is enabled */
+#define MONITOR_ATOMIC_DEC_LOW(monitor, enabled) \
+ if (enabled) { \
+ ib_uint64_t value; \
+ value = my_atomic_add64_explicit( \
+ (int64*) &MONITOR_VALUE(monitor), -1, \
+ MY_MEMORY_ORDER_RELAXED) - 1; \
+ /* Note: This is not 100% accurate because of the \
+ inherent race, we ignore it due to performance. */ \
+ if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = value; \
+ } \
+ }
+
+/** Atomically increment a monitor counter if it is enabled.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor monitor to be incremented by 1 */
+#define MONITOR_ATOMIC_INC(monitor) \
+ MONITOR_ATOMIC_INC_LOW(monitor, MONITOR_IS_ON(monitor))
+/** Atomically decrement a monitor counter if it is enabled.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor monitor to be decremented by 1 */
+#define MONITOR_ATOMIC_DEC(monitor) \
+ MONITOR_ATOMIC_DEC_LOW(monitor, MONITOR_IS_ON(monitor))
+
+#define MONITOR_DEC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor)--; \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+#ifdef HAVE_MEM_CHECK
+# define MONITOR_CHECK_DEFINED(value) do { \
+ mon_type_t m __attribute__((unused))= value; \
+ MEM_CHECK_DEFINED(&m, sizeof m); \
+} while (0)
+#else /* HAVE_MEM_CHECK */
+# define MONITOR_CHECK_DEFINED(value) (void) 0
+#endif /* HAVE_MEM_CHECK */
+
+#define MONITOR_INC_VALUE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+#define MONITOR_DEC_VALUE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value); \
+ MONITOR_VALUE(monitor) -= (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/* Increment/decrement counter without check the monitor on/off bit, which
+could already be checked as a module group */
+#define MONITOR_INC_NOCHECK(monitor) \
+ do { \
+ MONITOR_VALUE(monitor)++; \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ } while (0) \
+
+#define MONITOR_DEC_NOCHECK(monitor) \
+ do { \
+ MONITOR_VALUE(monitor)--; \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ } while (0)
+
+/** Directly set a monitor counter's value */
+#define MONITOR_SET(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Add time difference between now and input "value" (in seconds) to the
+monitor counter
+@param monitor monitor to update for the time difference
+@param value the start time value */
+#define MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ uintmax_t old_time = value; \
+ value = microsecond_interval_timer(); \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+ }
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor', to see it is on or off to decide
+whether to do the update.
+@param monitor the main monitor counter to update. It accounts for
+ the accumulative value for the counter.
+@param monitor_n_calls counter that counts number of times this macro is
+ called
+@param monitor_per_call counter that records the current and max value of
+ each incremental value
+@param value incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE( \
+ monitor, monitor_n_calls, monitor_per_call, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor_n_calls)++; \
+ MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor_per_call) \
+ > MONITOR_MAX_VALUE(monitor_per_call)) { \
+ MONITOR_MAX_VALUE(monitor_per_call) = \
+ (mon_type_t) (value); \
+ } \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only max value needs to be updated */
+#define MONITOR_SET_UPD_MAX_ONLY(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Some values such as log sequence number are montomically increasing
+number, do not need to record max/min values */
+#define MONITOR_SET_SIMPLE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ }
+
+/** Reset the monitor value and max/min value to zero. The reset
+operation would only be conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor) \
+ do { \
+ MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \
+ MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \
+ MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \
+ MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_start_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_stop_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_reset_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ } while (0)
+
+/** Following four macros defines necessary operations to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to mon_start_value field of monitor
+counters */
+#define MONITOR_SAVE_START(monitor, value) do { \
+ MONITOR_CHECK_DEFINED(value); \
+ (MONITOR_START_VALUE(monitor) = \
+ (mon_type_t) (value) - MONITOR_VALUE_RESET(monitor)); \
+ } while (0)
+
+/** Save the passed-in value to mon_last_value field of monitor
+counters */
+#define MONITOR_SAVE_LAST(monitor) \
+ do { \
+ MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor); \
+ MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor); \
+ } while (0)
+
+/** Set monitor value to the difference of value and mon_start_value
+compensated by mon_last_value if accumulated value is required. */
+#define MONITOR_SET_DIFF(monitor, value) \
+ MONITOR_SET_UPD_MAX_ONLY(monitor, ((value) \
+ - MONITOR_VALUE_RESET(monitor) \
+ - MONITOR_FIELD(monitor, mon_start_value) \
+ + MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array
+@return Point to corresponding monitor_info_t, or NULL if no such
+monitor */
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+ monitor_id_t monitor_id); /*!< id index into the
+ innodb_counter_info array */
+/****************************************************************//**
+Get monitor's name by its monitor id (index into the
+innodb_counter_info array
+@return corresponding monitor name, or NULL if no such
+monitor */
+const char*
+srv_mon_get_name(
+/*=============*/
+ monitor_id_t monitor_id); /*!< id index into the
+ innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_value
+is NUM_MONITOR then turn on all monitor counters.
+@return 0 if successful, or the first monitor that cannot be
+turned on because it is already turned on. */
+void
+srv_mon_set_module_control(
+/*=======================*/
+ monitor_id_t module_id, /*!< in: Module ID as in
+ monitor_counter_id. If it is
+ set to NUM_MONITOR, this means
+ we shall turn on all the counters */
+ mon_option_t set_option); /*!< in: Turn on/off reset the
+ counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do appropriate
+mathematics to deduct the actual value. */
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+ monitor_id_t monitor_id, /*!< in: the monitor's ID as in
+ monitor_counter_id */
+ mon_option_t set_option); /*!< in: Turn on/off reset the
+ counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+ monitor_id_t monitor); /*!< in: monitor id */
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+Reset a monitor, create a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+void
+srv_mon_reset(
+/*==========*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#include "srv0mon.inl"
+
+#endif
diff --git a/storage/innobase/include/srv0mon.inl b/storage/innobase/include/srv0mon.inl
new file mode 100644
index 00000000..158345b2
--- /dev/null
+++ b/storage/innobase/include/srv0mon.inl
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.ic
+Server monitoring system
+
+Created 1/20/2010 Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+ /* MONITOR_MAX_VALUE_START has not yet been
+ initialized, the max value since start is the
+ max count in MONITOR_MAX_VALUE */
+ MONITOR_MAX_VALUE_START(monitor) =
+ MONITOR_MAX_VALUE(monitor);
+
+ } else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+ && (MONITOR_MAX_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor)
+ > MONITOR_MAX_VALUE_START(monitor))) {
+
+ /* If the max value since reset (as specified
+ in MONITOR_MAX_VALUE) plus the reset value is
+ larger than MONITOR_MAX_VALUE_START, reset
+ MONITOR_MAX_VALUE_START to this new max value */
+ MONITOR_MAX_VALUE_START(monitor) =
+ MONITOR_MAX_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor);
+ }
+
+ return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) {
+
+ /* MONITOR_MIN_VALUE_START has not yet been
+ initialized, the min value since start is the
+ min count in MONITOR_MIN_VALUE */
+ MONITOR_MIN_VALUE_START(monitor) =
+ MONITOR_MIN_VALUE(monitor);
+
+ } else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED
+ && (MONITOR_MIN_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor)
+ < MONITOR_MIN_VALUE_START(monitor))) {
+
+ /* If the min value since reset (as specified
+ in MONITOR_MIN_VALUE) plus the reset value is
+ less than MONITOR_MIN_VALUE_START, reset
+ MONITOR_MIN_VALUE_START to this new min value */
+ MONITOR_MIN_VALUE_START(monitor) =
+ MONITOR_MIN_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor);
+ }
+
+ return(MONITOR_MIN_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ /* Do not reset all counter values if monitor is still on. */
+ if (MONITOR_IS_ON(monitor)) {
+ fprintf(stderr, "InnoDB: Cannot reset all values for"
+ " monitor counter %s while it is on. Please"
+ " turn it off and retry.\n",
+ srv_mon_get_name(monitor));
+ } else {
+ MONITOR_RESET_ALL(monitor);
+ }
+}
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
new file mode 100644
index 00000000..db846795
--- /dev/null
+++ b/storage/innobase/include/srv0srv.h
@@ -0,0 +1,715 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "fil0fil.h"
+#include "ut0counter.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+#include <tpool.h>
+#include <memory>
+
+/** Simple non-atomic counter
+@tparam Type the integer type of the counter */
+template <typename Type>
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter
+{
+ /** Increment the counter */
+ Type inc() { return add(1); }
+ /** Decrement the counter */
+ Type dec() { return add(Type(~0)); }
+
+ /** Add to the counter
+ @param i amount to be added
+ @return the value of the counter after adding */
+ Type add(Type i) { return m_counter += i; }
+
+ /** @return the value of the counter */
+ operator Type() const { return m_counter; }
+
+private:
+ /** The counter */
+ Type m_counter;
+};
+
+/** Global counters used inside InnoDB. */
+struct srv_stats_t
+{
+ typedef ib_counter_t<ulint> ulint_ctr_n_t;
+ typedef simple_counter<lsn_t> lsn_ctr_1_t;
+ typedef simple_counter<ulint> ulint_ctr_1_t;
+ typedef simple_counter<int64_t> int64_ctr_1_t;
+
+ /** Count the amount of data written in total (in bytes) */
+ ulint_ctr_1_t data_written;
+ /** Number of bytes saved by page compression */
+ ulint_ctr_n_t page_compression_saved;
+ /* Number of pages compressed with page compression */
+ ulint_ctr_n_t pages_page_compressed;
+ /* Number of TRIM operations induced by page compression */
+ ulint_ctr_n_t page_compressed_trim_op;
+ /* Number of pages decompressed with page compression */
+ ulint_ctr_n_t pages_page_decompressed;
+ /* Number of page compression errors */
+ ulint_ctr_n_t pages_page_compression_error;
+ /* Number of pages encrypted */
+ ulint_ctr_n_t pages_encrypted;
+ /* Number of pages decrypted */
+ ulint_ctr_n_t pages_decrypted;
+ /* Number of merge blocks encrypted */
+ ulint_ctr_n_t n_merge_blocks_encrypted;
+ /* Number of merge blocks decrypted */
+ ulint_ctr_n_t n_merge_blocks_decrypted;
+ /* Number of row log blocks encrypted */
+ ulint_ctr_n_t n_rowlog_blocks_encrypted;
+ /* Number of row log blocks decrypted */
+ ulint_ctr_n_t n_rowlog_blocks_decrypted;
+
+ /** Number of data read in total (in bytes) */
+ ulint_ctr_1_t data_read;
+
+ /** Number of encryption_get_latest_key_version calls */
+ ulint_ctr_n_t n_key_requests;
+
+ /** Number of temporary tablespace blocks encrypted */
+ ulint_ctr_n_t n_temp_blocks_encrypted;
+
+ /** Number of temporary tablespace blocks decrypted */
+ ulint_ctr_n_t n_temp_blocks_decrypted;
+};
+
+/** We are prepared for a situation that we have this many threads waiting for
+a transactional lock inside InnoDB. srv_start() sets the value. */
+extern ulint srv_max_n_threads;
+
+extern const char* srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char srv_mysql50_table_name_prefix[10];
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT "ib_buffer_pool"
+extern char* srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char srv_buffer_pool_dump_at_shutdown;
+extern char srv_buffer_pool_load_at_startup;
+
+/* Whether to disable file system cache if it is defined */
+extern char srv_disable_sort_file_cache;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT (srv_sys_space.get_autoextend_increment())
+
+/** Mutex protecting page_zip_stat_per_index */
+extern mysql_mutex_t page_zip_stat_per_index_mutex;
+/** Mutex for locking srv_monitor_file */
+extern mysql_mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE* srv_monitor_file;
+/** Mutex for locking srv_misc_tmpfile */
+extern mysql_mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellanous diagnostic output */
+extern FILE* srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char* srv_data_home;
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool srv_read_only_mode;
+/** Set if InnoDB operates in read-only mode or innodb-force-recovery
+is greater than SRV_FORCE_NO_IBUF_MERGE. */
+extern my_bool high_level_read_only;
+/** store to its own file each table created by an user; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool srv_file_per_table;
+
+/** Sort buffer size in index creation */
+extern ulong srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio.
+Currently we support native aio on windows and linux */
+extern my_bool srv_use_native_aio;
+extern my_bool srv_numa_interleave;
+
+/* Use atomic writes i.e disable doublewrite buffer */
+extern my_bool srv_use_atomic_writes;
+
+/* Compression algorithm*/
+extern ulong innodb_compression_algorithm;
+
+/** TRUE if the server was successfully started */
+extern bool srv_was_started;
+
+/** Server undo tablespaces directory, can be absolute path. */
+extern char* srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern uint srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are active (hosting some rollback
+segment). It is quite possible that some of the tablespaces doesn't host
+any of the rollback-segment based on configuration used. */
+extern uint32_t srv_undo_tablespaces_active;
+
+/** Maximum size of undo tablespace. */
+extern unsigned long long srv_max_undo_log_size;
+
+extern uint srv_n_fil_crypt_threads;
+extern uint srv_n_fil_crypt_threads_started;
+
+/** Rate at which UNDO records should be purged. */
+extern ulong srv_purge_rseg_truncate_frequency;
+
+/** Enable or Disable Truncate of UNDO tablespace. */
+extern my_bool srv_undo_log_truncate;
+
+/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
+constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
+ UNIV_PAGE_SIZE_DEF;
+
+extern char* srv_log_group_home_dir;
+
+/** The InnoDB redo log file size, or 0 when changing the redo log format
+at startup (while disallowing writes to the redo log). */
+extern ulonglong srv_log_file_size;
+extern ulong srv_flush_log_at_trx_commit;
+extern uint srv_flush_log_at_timeout;
+extern my_bool srv_adaptive_flushing;
+extern my_bool srv_flush_sync;
+
+/** Requested size in bytes */
+extern ulint srv_buf_pool_size;
+/** Requested buffer pool chunk size */
+extern size_t srv_buf_pool_chunk_unit;
+/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/
+extern ulong srv_LRU_scan_depth;
+/** Whether or not to flush neighbors of a block */
+extern ulong srv_flush_neighbors;
+/** Previously requested size */
+extern ulint srv_buf_pool_old_size;
+/** Current size as scaling factor for the other components */
+extern ulint srv_buf_pool_base_size;
+/** Current size in bytes */
+extern ulint srv_buf_pool_curr_size;
+/** Dump this % of each buffer pool during BP dump */
+extern ulong srv_buf_pool_dump_pct;
+#ifdef UNIV_DEBUG
+/** Abort load after this amount of pages */
+extern ulong srv_buf_pool_load_pages_abort;
+#endif
+/** Lock table size in bytes */
+extern ulint srv_lock_table_size;
+
+/** the value of innodb_checksum_algorithm */
+extern ulong srv_checksum_algorithm;
+extern my_bool srv_random_read_ahead;
+extern ulong srv_read_ahead_threshold;
+extern uint srv_n_read_io_threads;
+extern uint srv_n_write_io_threads;
+
+/* Defragmentation, Origianlly facebook default value is 100, but it's too high */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool srv_defragment;
+extern uint srv_defragment_n_pages;
+extern uint srv_defragment_stats_accuracy;
+extern uint srv_defragment_fill_factor_n_recs;
+extern double srv_defragment_fill_factor;
+extern uint srv_defragment_frequency;
+extern ulonglong srv_defragment_interval;
+
+extern uint srv_change_buffer_max_size;
+
+/* Number of IO operations per second the server can do */
+extern ulong srv_io_capacity;
+
+/* We use this dummy default value at startup for max_io_capacity.
+The real value is set based on the value of io_capacity. */
+#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL)
+#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL)
+extern ulong srv_max_io_capacity;
+
+/* The "innodb_stats_method" setting, decides how InnoDB is going
+to treat NULL value when collecting statistics. It is not defined
+as enum type because the configure option takes unsigned integer type. */
+extern ulong srv_innodb_stats_method;
+
+extern ulint srv_max_n_open_files;
+
+extern double srv_max_buf_pool_modified_pct;
+extern double srv_max_dirty_pages_pct_lwm;
+
+extern double srv_adaptive_flushing_lwm;
+extern ulong srv_flushing_avg_loops;
+
+extern ulong srv_force_recovery;
+
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transaction (to be done on restart). */
+extern uint srv_fast_shutdown;
+
+extern ibool srv_innodb_status;
+
+extern unsigned long long srv_stats_transient_sample_pages;
+extern my_bool srv_stats_persistent;
+extern unsigned long long srv_stats_persistent_sample_pages;
+extern my_bool srv_stats_auto_recalc;
+extern my_bool srv_stats_include_delete_marked;
+extern unsigned long long srv_stats_modified_counter;
+extern my_bool srv_stats_sample_traditional;
+
+extern my_bool srv_use_doublewrite_buf;
+extern ulong srv_checksum_algorithm;
+
+extern my_bool srv_force_primary_key;
+
+extern ulong srv_max_purge_lag;
+extern ulong srv_max_purge_lag_delay;
+
+extern my_bool innodb_encrypt_temporary_tables;
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+/*-------------------------------------------*/
+
+/** Modes of operation */
+enum srv_operation_mode {
+ /** Normal mode (MariaDB Server) */
+ SRV_OPERATION_NORMAL,
+ /** Mariabackup is executing server to export already restored
+ tablespaces */
+ SRV_OPERATION_EXPORT_RESTORED,
+ /** Mariabackup taking a backup */
+ SRV_OPERATION_BACKUP,
+ /** Mariabackup restoring a backup for subsequent --copy-back */
+ SRV_OPERATION_RESTORE,
+ /** Mariabackup restoring the incremental part of a backup */
+ SRV_OPERATION_RESTORE_DELTA,
+ /** Mariabackup restoring a backup for subsequent --export */
+ SRV_OPERATION_RESTORE_EXPORT,
+ /** Mariabackup taking a backup and avoid deferring
+ any tablespace */
+ SRV_OPERATION_BACKUP_NO_DEFER
+};
+
+/** Current mode of operation */
+extern enum srv_operation_mode srv_operation;
+
+/** whether this is the server's first start after mariabackup --prepare */
+extern bool srv_start_after_restore;
+
+extern my_bool srv_print_innodb_monitor;
+extern my_bool srv_print_innodb_lock_monitor;
+extern ibool srv_print_verbose_log;
+
+extern bool srv_monitor_active;
+
+
+extern ulong srv_n_spin_wait_rounds;
+extern uint srv_spin_wait_delay;
+
+/** Number of initialized rollback segments for persistent undo log */
+extern ulong srv_available_undo_logs;
+/** Iterations of the loop bounded by 'srv_active' label. */
+extern ulint srv_main_active_loops;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+extern ulint srv_main_idle_loops;
+/** Log writes involving flush. */
+extern ulint srv_log_writes_and_flush;
+
+#ifdef UNIV_DEBUG
+extern my_bool innodb_evict_tables_on_commit_debug;
+extern my_bool srv_purge_view_update_only_debug;
+
+/** InnoDB system tablespace to set during recovery */
+extern uint srv_sys_space_size_debug;
+/** whether redo log file has been created at startup */
+extern bool srv_log_file_created;
+#endif /* UNIV_DEBUG */
+
+extern ulint srv_dml_needed_delay;
+
+/** innodb_purge_threads; the number of purge tasks to use */
+extern uint srv_n_purge_threads;
+
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
+
+/* print all user-level transactions deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
+extern my_bool srv_cmp_per_index_enabled;
+
+/** innodb_encrypt_log */
+extern my_bool srv_encrypt_log;
+
+/* is encryption enabled */
+extern ulong srv_encrypt_tables;
+
+
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
+
+/** Global counters */
+extern srv_stats_t srv_stats;
+
+/** Fatal semaphore wait threshold = maximum number of seconds
+that semaphore times out in InnoDB */
+#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600
+extern ulong srv_fatal_semaphore_wait_threshold;
+
+/** Buffer pool dump status frequence in percentages */
+extern ulong srv_buf_dump_status_frequency;
+
+# ifdef UNIV_PFS_THREAD
+extern mysql_pfs_key_t page_cleaner_thread_key;
+extern mysql_pfs_key_t trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t thread_pool_thread_key;
+
+/* This macro register the current thread and its key with performance
+schema */
+# define pfs_register_thread(key) \
+do { \
+ struct PSI_thread* psi __attribute__((unused)) \
+ = PSI_CALL_new_thread(key, NULL, 0); \
+ PSI_CALL_set_thread_os_id(psi); \
+ PSI_CALL_set_thread(psi); \
+} while (0)
+
+/* This macro delist the current thread from performance schema */
+# define pfs_delete_thread() \
+do { \
+ PSI_CALL_delete_current_thread(); \
+} while (0)
+# else
+# define pfs_register_thread(key)
+# define pfs_delete_thread()
+# endif /* UNIV_PFS_THREAD */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in ha_innobase::commit_inplace_alter_table(). */
+extern PSI_stage_info srv_stage_alter_table_end;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_merge_insert_index_tuples(). */
+extern PSI_stage_info srv_stage_alter_table_insert;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_log_apply(). */
+extern PSI_stage_info srv_stage_alter_table_log_index;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_log_table_apply(). */
+extern PSI_stage_info srv_stage_alter_table_log_table;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_merge_sort(). */
+extern PSI_stage_info srv_stage_alter_table_merge_sort;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_merge_read_clustered_index(). */
+extern PSI_stage_info srv_stage_alter_table_read_pk_internal_sort;
+
+/** Performance schema stage event for monitoring buffer pool load progress. */
+extern PSI_stage_info srv_stage_buffer_pool_load;
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that he can dump intact
+tables and rows with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+ SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it
+ detects a corrupt page */
+ SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from
+ running: if a crash would occur
+ in purge, this prevents it */
+ SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run DML rollback after
+ recovery */
+ SRV_FORCE_NO_DDL_UNDO = 4, /*!< prevent also DDL rollback */
+ SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when
+ starting the database: InnoDB will
+ treat even incomplete transactions
+ as committed */
+ SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward
+ in connection with recovery */
+};
+
+/* Alternatives for srv_innodb_stats_method, which could be changed by
+setting innodb_stats_method */
+enum srv_stats_method_name_enum {
+ SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as
+ equal. This is the default setting
+ for innodb_stats_method */
+ SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as
+ NOT equal. */
+ SRV_STATS_NULLS_IGNORED /* NULL values are ignored */
+};
+
+typedef enum srv_stats_method_name_enum srv_stats_method_name_t;
+
+/*********************************************************************//**
+Boots Innobase server. */
+void
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+void
+srv_free(void);
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for lock_sys.latch */
+ ulint* trx_start, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end); /*!< out: file position of the end of
+ the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+void
+srv_export_innodb_status(void);
+/*==========================*/
+/*******************************************************************//**
+Get current server activity count.
+@return activity count. */
+ulint
+srv_get_activity_count(void);
+/*========================*/
+
+/******************************************************************//**
+Increment the server activity counter. */
+void
+srv_inc_activity_count(void);
+/*=========================*/
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+#ifdef UNIV_DEBUG
+/** @return whether purge or master task is active */
+bool srv_any_background_activity();
+#endif
+
+extern "C" {
+
+
+/** Periodic task which prints the info output by various InnoDB monitors.*/
+void srv_monitor_task(void*);
+
+
+/** The periodic master task controlling the server. */
+void srv_master_callback(void*);
+
+
+/**
+Complete the shutdown tasks such as background DROP TABLE,
+and optionally change buffer merge (on innodb_fast_shutdown=0). */
+void srv_shutdown(bool ibuf_merge);
+
+} /* extern "C" */
+
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length();
+#endif
+
+/** Shut down the purge threads. */
+void srv_purge_shutdown();
+
+/** Init purge tasks*/
+void srv_init_purge_tasks();
+
+/** Status variables to be passed to MySQL */
+struct export_var_t{
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint innodb_ahi_hit;
+ ulint innodb_ahi_miss;
+#endif /* BTR_CUR_HASH_ADAPT */
+ char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */
+ char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */
+ char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
+ my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */
+ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */
+ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */
+ ulint innodb_buffer_pool_pages_misc; /*!< Miscellanous pages */
+#ifdef UNIV_DEBUG
+ ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */
+#endif /* UNIV_DEBUG */
+ /** buf_pool.stat.n_page_gets (a sharded counter) */
+ ulint innodb_buffer_pool_read_requests;
+ ulint innodb_checkpoint_age;
+ ulint innodb_checkpoint_max_age;
+ ulint innodb_data_pending_reads; /*!< Pending reads */
+ ulint innodb_data_pending_writes; /*!< Pending writes */
+ ulint innodb_data_read; /*!< Data bytes read */
+ ulint innodb_data_writes; /*!< I/O write requests */
+ ulint innodb_data_written; /*!< Data bytes written */
+ ulint innodb_data_reads; /*!< I/O read requests */
+ ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */
+ ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */
+ ulint innodb_deadlocks;
+ ulint innodb_history_list_length;
+ lsn_t innodb_lsn_current;
+ lsn_t innodb_lsn_flushed;
+ lsn_t innodb_lsn_last_checkpoint;
+ trx_id_t innodb_max_trx_id;
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint innodb_mem_adaptive_hash;
+#endif
+ ulint innodb_mem_dictionary;
+ /** log_sys.get_lsn() - recv_sys.lsn */
+ lsn_t innodb_os_log_written;
+ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */
+ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */
+ int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time
+ / 1000 */
+ uint64_t innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time
+ / srv_n_lock_wait_count */
+ uint64_t innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time */
+
+ /** Number of undo tablespace truncation operations */
+ ulong innodb_undo_truncations;
+ ulint innodb_defragment_compression_failures; /*!< Number of
+ defragment re-compression
+ failures */
+
+ ulint innodb_defragment_failures; /*!< Number of defragment
+ failures*/
+ ulint innodb_defragment_count; /*!< Number of defragment
+ operations*/
+
+ /** Number of instant ALTER TABLE operations that affect columns */
+ ulong innodb_instant_alter_column;
+
+ ulint innodb_onlineddl_rowlog_rows; /*!< Online alter rows */
+ ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage
+ of used row log buffer */
+ ulint innodb_onlineddl_pct_progress; /*!< Online alter progress */
+
+ int64_t innodb_page_compression_saved;/*!< Number of bytes saved
+ by page compression */
+ int64_t innodb_pages_page_compressed;/*!< Number of pages
+ compressed by page compression */
+ int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations
+ induced by page compression */
+ int64_t innodb_pages_page_decompressed;/*!< Number of pages
+ decompressed by page
+ compression */
+ int64_t innodb_pages_page_compression_error;/*!< Number of page
+ compression errors */
+ int64_t innodb_pages_encrypted; /*!< Number of pages
+ encrypted */
+ int64_t innodb_pages_decrypted; /*!< Number of pages
+ decrypted */
+
+ /*!< Number of merge blocks encrypted */
+ ib_int64_t innodb_n_merge_blocks_encrypted;
+ /*!< Number of merge blocks decrypted */
+ ib_int64_t innodb_n_merge_blocks_decrypted;
+ /*!< Number of row log blocks encrypted */
+ ib_int64_t innodb_n_rowlog_blocks_encrypted;
+ /*!< Number of row log blocks decrypted */
+ ib_int64_t innodb_n_rowlog_blocks_decrypted;
+
+ /* Number of temporary tablespace pages encrypted */
+ ib_int64_t innodb_n_temp_blocks_encrypted;
+
+ /* Number of temporary tablespace pages decrypted */
+ ib_int64_t innodb_n_temp_blocks_decrypted;
+
+ ulint innodb_encryption_rotation_pages_read_from_cache;
+ ulint innodb_encryption_rotation_pages_read_from_disk;
+ ulint innodb_encryption_rotation_pages_modified;
+ ulint innodb_encryption_rotation_pages_flushed;
+ ulint innodb_encryption_rotation_estimated_iops;
+ int64_t innodb_encryption_key_requests;
+};
+
+extern tpool::thread_pool *srv_thread_pool;
+extern std::unique_ptr<tpool::timer> srv_master_timer;
+extern std::unique_ptr<tpool::timer> srv_monitor_timer;
+
+/** The interval at which srv_monitor_task is invoked, in milliseconds */
+constexpr unsigned SRV_MONITOR_INTERVAL= 15000; /* 4 times per minute */
+
+static inline void srv_monitor_timer_schedule_now()
+{
+ srv_monitor_timer->set_time(0, SRV_MONITOR_INTERVAL);
+}
+static inline void srv_start_periodic_timer(std::unique_ptr<tpool::timer>& t,
+ void (*func)(void*), int period)
+{
+ t.reset(srv_thread_pool->create_timer(func));
+ t->set_time(0, period);
+}
+
+void srv_thread_pool_init();
+void srv_thread_pool_end();
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
new file mode 100644
index 00000000..c18cf1ce
--- /dev/null
+++ b/storage/innobase/include/srv0start.h
@@ -0,0 +1,124 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "ut0byte.h"
+
+// Forward declaration
+struct dict_table_t;
+
+/** Open the configured number of dedicated undo tablespaces.
+@param[in] create_new_undo whether the undo tablespaces has to be created
+@param[in,out] mtr mini-transaction
+@return DB_SUCCESS or error code */
+dberr_t srv_undo_tablespaces_init(bool create_new_undo, mtr_t *mtr);
+
+/** Start InnoDB.
+@param[in] create_new_db whether to create a new database
+@return DB_SUCCESS or error code */
+dberr_t srv_start(bool create_new_db);
+
+/**
+ Shutdown purge to make sure that there is no possibility that we call any
+ plugin code (e.g., audit) inside virtual column computation.
+*/
+void innodb_preshutdown();
+
+/** Shut down InnoDB. */
+void innodb_shutdown();
+
+/*************************************************************//**
+Copy the file path component of the physical file to parameter. It will
+copy up to and including the terminating path separator.
+@return number of bytes copied or ULINT_UNDEFINED if destination buffer
+ is smaller than the path to be copied. */
+ulint
+srv_path_copy(
+/*==========*/
+ char* dest, /*!< out: destination buffer */
+ ulint dest_len, /*!< in: max bytes to copy */
+ const char* basedir, /*!< in: base directory */
+ const char* table_name) /*!< in: source table name */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get the meta-data filename from the table name for a
+single-table tablespace.
+@param[in] table table object
+@param[out] filename filename
+@param[in] max_len filename max length */
+void
+srv_get_meta_data_filename(
+ dict_table_t* table,
+ char* filename,
+ ulint max_len);
+
+/** Get the encryption-data filename from the table name for a
+single-table tablespace.
+@param[in] table table object
+@param[out] filename filename
+@param[in] max_len filename max length */
+void
+srv_get_encryption_data_filename(
+ dict_table_t* table,
+ char* filename,
+ ulint max_len);
+
+/** Log sequence number at shutdown */
+extern lsn_t srv_shutdown_lsn;
+
+/** TRUE if the server is being started */
+extern bool srv_is_being_started;
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+extern bool srv_startup_is_before_trx_rollback_phase;
+
+/** TRUE if a raw partition is in use */
+extern ibool srv_start_raw_disk_in_use;
+
+/** Shutdown state */
+enum srv_shutdown_t {
+ SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */
+ /** Shutdown initiated in srv_shutdown_bg_undo_sources() */
+ SRV_SHUTDOWN_INITIATED,
+ SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in
+ logs_empty_and_mark_files_at_shutdown() */
+ SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
+ the buffer pool can be freed: flush
+ all file spaces and close all files */
+ SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */
+};
+
+/** Whether any undo log records can be generated */
+extern bool srv_undo_sources;
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+extern enum srv_shutdown_t srv_shutdown_state;
+
+/** Files comprising the system tablespace */
+extern pfs_os_file_t files[1000];
diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h
new file mode 100644
index 00000000..1dca0cc1
--- /dev/null
+++ b/storage/innobase/include/srw_lock.h
@@ -0,0 +1,554 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "univ.i"
+#include "rw_lock.h"
+
+#if defined __linux__
+/* futex(2): FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE */
+#elif defined __OpenBSD__ || defined __FreeBSD__ || defined __DragonFly__
+/* system calls similar to Linux futex(2) */
+#elif defined _WIN32
+/* SRWLOCK as well as WaitOnAddress(), WakeByAddressSingle() */
+#else
+# define SUX_LOCK_GENERIC /* fall back to generic synchronization primitives */
+#endif
+
+#if !defined SUX_LOCK_GENERIC && 0 /* defined SAFE_MUTEX */
+# define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
+#endif
+
+#ifdef SUX_LOCK_GENERIC
+/** An exclusive-only variant of srw_lock */
+template<bool spinloop>
+class pthread_mutex_wrapper final
+{
+ pthread_mutex_t lock;
+public:
+ void init()
+ {
+ if (spinloop)
+ pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);
+ else
+ pthread_mutex_init(&lock, nullptr);
+ }
+ void destroy() { pthread_mutex_destroy(&lock); }
+# ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+ void wr_lock() { pthread_mutex_lock(&lock); }
+# else
+private:
+ void wr_wait();
+public:
+ inline void wr_lock();
+# endif
+ void wr_unlock() { pthread_mutex_unlock(&lock); }
+ bool wr_lock_try() { return !pthread_mutex_trylock(&lock); }
+};
+
+# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+template<> void pthread_mutex_wrapper<true>::wr_wait();
+template<>
+inline void pthread_mutex_wrapper<false>::wr_lock()
+{ pthread_mutex_lock(&lock); }
+template<>
+inline void pthread_mutex_wrapper<true>::wr_lock()
+{ if (!wr_lock_try()) wr_wait(); }
+# endif
+#endif
+
+/** Futex-based mutex */
+template<bool spinloop>
+class srw_mutex_impl final
+{
+ /** The lock word, containing HOLDER + 1 if the lock is being held,
+ plus the number of waiters */
+ std::atomic<uint32_t> lock;
+ /** Identifies that the lock is being held */
+ static constexpr uint32_t HOLDER= 1U << 31;
+
+#ifdef SUX_LOCK_GENERIC
+public:
+ /** The mutex for the condition variables. */
+ pthread_mutex_t mutex;
+private:
+ /** Condition variable for the lock word. Used with mutex. */
+ pthread_cond_t cond;
+#endif
+
+ /** Wait until the mutex has been acquired */
+ void wait_and_lock();
+ /** Wait for lock!=lk */
+ inline void wait(uint32_t lk);
+ /** Wake up one wait() thread */
+ void wake();
+public:
+ /** @return whether the mutex is being held or waited for */
+ bool is_locked_or_waiting() const
+ { return lock.load(std::memory_order_acquire) != 0; }
+ /** @return whether the mutex is being held by any thread */
+ bool is_locked() const
+ { return (lock.load(std::memory_order_acquire) & HOLDER) != 0; }
+
+ void init()
+ {
+ DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_init(&mutex, nullptr);
+ pthread_cond_init(&cond, nullptr);
+#endif
+ }
+ void destroy()
+ {
+ DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_destroy(&mutex);
+ pthread_cond_destroy(&cond);
+#endif
+ }
+
+ /** @return whether the mutex was acquired */
+ bool wr_lock_try()
+ {
+ uint32_t lk= 0;
+ return lock.compare_exchange_strong(lk, HOLDER + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
+ void wr_unlock()
+ {
+ const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
+ if (lk != HOLDER + 1)
+ {
+ DBUG_ASSERT(lk & HOLDER);
+ wake();
+ }
+ }
+};
+
+#ifdef SUX_LOCK_GENERIC
+typedef pthread_mutex_wrapper<true> srw_spin_mutex;
+typedef pthread_mutex_wrapper<false> srw_mutex;
+#else
+typedef srw_mutex_impl<true> srw_spin_mutex;
+typedef srw_mutex_impl<false> srw_mutex;
+#endif
+
+template<bool spinloop> class srw_lock_impl;
+
+/** Slim shared-update-exclusive lock with no recursion */
+template<bool spinloop>
+class ssux_lock_impl final
+{
+#ifdef UNIV_PFS_RWLOCK
+ friend class ssux_lock;
+# ifdef SUX_LOCK_GENERIC
+# elif defined _WIN32
+# else
+ friend srw_lock_impl<spinloop>;
+# endif
+#endif
+ /** mutex for synchronization; held by U or X lock holders */
+ srw_mutex_impl<spinloop> writer;
+#ifdef SUX_LOCK_GENERIC
+ /** Condition variable for "readers"; used with writer.mutex. */
+ pthread_cond_t readers_cond;
+#endif
+ /** S or U holders, and WRITER flag for X holder or waiter */
+ std::atomic<uint32_t> readers;
+ /** indicates an X request; readers=WRITER indicates granted X lock */
+ static constexpr uint32_t WRITER= 1U << 31;
+
+ /** Wait for readers!=lk */
+ inline void wait(uint32_t lk);
+
+ /** Wait for readers!=lk|WRITER */
+ void wr_wait(uint32_t lk);
+ /** Wake up wait() on the last rd_unlock() */
+ void wake();
+ /** Acquire a read lock */
+ void rd_wait();
+public:
+ void init()
+ {
+ writer.init();
+ DBUG_ASSERT(is_vacant());
+#ifdef SUX_LOCK_GENERIC
+ pthread_cond_init(&readers_cond, nullptr);
+#endif
+ }
+ void destroy()
+ {
+ DBUG_ASSERT(is_vacant());
+ writer.destroy();
+#ifdef SUX_LOCK_GENERIC
+ pthread_cond_destroy(&readers_cond);
+#endif
+ }
+ /** @return whether any writer is waiting */
+ bool is_waiting() const
+ { return (readers.load(std::memory_order_relaxed) & WRITER) != 0; }
+#ifndef DBUG_OFF
+ /** @return whether the lock is being held or waited for */
+ bool is_vacant() const { return !is_locked_or_waiting(); }
+#endif /* !DBUG_OFF */
+
+ bool rd_lock_try()
+ {
+ uint32_t lk= 0;
+ while (!readers.compare_exchange_weak(lk, lk + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ if (lk & WRITER)
+ return false;
+ return true;
+ }
+
+ bool u_lock_try()
+ {
+ if (!writer.wr_lock_try())
+ return false;
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_add(1, std::memory_order_acquire);
+ DBUG_ASSERT(lk < WRITER - 1);
+ return true;
+ }
+
+ bool wr_lock_try()
+ {
+ if (!writer.wr_lock_try())
+ return false;
+ uint32_t lk= 0;
+ if (readers.compare_exchange_strong(lk, WRITER,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ return true;
+ writer.wr_unlock();
+ return false;
+ }
+
+ void rd_lock() { if (!rd_lock_try()) rd_wait(); }
+ void u_lock()
+ {
+ writer.wr_lock();
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_add(1, std::memory_order_acquire);
+ DBUG_ASSERT(lk < WRITER - 1);
+ }
+ void wr_lock()
+ {
+ writer.wr_lock();
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+ /* On IA-32 and AMD64, this type of fetch_or() can only be implemented
+ as a loop around LOCK CMPXCHG. In this particular case, setting the
+ most significant bit using fetch_add() is equivalent, and is
+ translated into a simple LOCK XADD. */
+ static_assert(WRITER == 1U << 31, "compatibility");
+ if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
+ wr_wait(lk);
+#else
+ if (uint32_t lk= readers.fetch_or(WRITER, std::memory_order_acquire))
+ wr_wait(lk);
+#endif
+ }
+
+ void u_wr_upgrade()
+ {
+ DBUG_ASSERT(writer.is_locked());
+ uint32_t lk= readers.fetch_add(WRITER - 1, std::memory_order_acquire);
+ if (lk != 1)
+ wr_wait(lk - 1);
+ }
+ void wr_u_downgrade()
+ {
+ DBUG_ASSERT(writer.is_locked());
+ DBUG_ASSERT(is_write_locked());
+ readers.store(1, std::memory_order_release);
+ /* Note: Any pending rd_lock() will not be woken up until u_unlock() */
+ }
+
+ void rd_unlock()
+ {
+ uint32_t lk= readers.fetch_sub(1, std::memory_order_release);
+ ut_ad(~WRITER & lk);
+ if (lk == WRITER + 1)
+ wake();
+ }
+ void u_unlock()
+ {
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_sub(1, std::memory_order_release);
+ DBUG_ASSERT(lk);
+ DBUG_ASSERT(lk < WRITER);
+ writer.wr_unlock();
+ }
+ void wr_unlock()
+ {
+ DBUG_ASSERT(is_write_locked());
+ readers.store(0, std::memory_order_release);
+ writer.wr_unlock();
+ }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) == WRITER; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) != 0; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return is_locked() || writer.is_locked_or_waiting(); }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
+};
+
+#if defined _WIN32 || defined SUX_LOCK_GENERIC
+/** Slim read-write lock */
+template<bool spinloop>
+class srw_lock_
+{
+# ifdef UNIV_PFS_RWLOCK
+ friend srw_lock_impl<spinloop>;
+# endif
+# ifdef _WIN32
+ SRWLOCK lk;
+# else
+ rw_lock_t lk;
+# endif
+
+ void rd_wait();
+ void wr_wait();
+public:
+ void init() { IF_WIN(,my_rwlock_init(&lk, nullptr)); }
+ void destroy() { IF_WIN(,rwlock_destroy(&lk)); }
+ inline void rd_lock();
+ inline void wr_lock();
+ bool rd_lock_try()
+ { return IF_WIN(TryAcquireSRWLockShared(&lk), !rw_tryrdlock(&lk)); }
+ void rd_unlock()
+ { IF_WIN(ReleaseSRWLockShared(&lk), rw_unlock(&lk)); }
+ bool wr_lock_try()
+ { return IF_WIN(TryAcquireSRWLockExclusive(&lk), !rw_trywrlock(&lk)); }
+ void wr_unlock()
+ { IF_WIN(ReleaseSRWLockExclusive(&lk), rw_unlock(&lk)); }
+#ifdef _WIN32
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept { return (size_t&)(lk) != 0; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept { return is_locked_or_waiting(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ {
+ // FIXME: this returns false positives for shared locks
+ return is_locked();
+ }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
+#endif
+};
+
+template<> void srw_lock_<true>::rd_wait();
+template<> void srw_lock_<true>::wr_wait();
+
+template<>
+inline void srw_lock_<false>::rd_lock()
+{ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk)); }
+template<>
+inline void srw_lock_<false>::wr_lock()
+{ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk)); }
+
+template<>
+inline void srw_lock_<true>::rd_lock() { if (!rd_lock_try()) rd_wait(); }
+template<>
+inline void srw_lock_<true>::wr_lock() { if (!wr_lock_try()) wr_wait(); }
+
+typedef srw_lock_<false> srw_lock_low;
+typedef srw_lock_<true> srw_spin_lock_low;
+#else
+typedef ssux_lock_impl<false> srw_lock_low;
+typedef ssux_lock_impl<true> srw_spin_lock_low;
+#endif
+
+#ifndef UNIV_PFS_RWLOCK
+# define SRW_LOCK_INIT(key) init()
+# define SRW_LOCK_ARGS(file, line) /* nothing */
+# define SRW_LOCK_CALL /* nothing */
+typedef srw_lock_low srw_lock;
+typedef srw_spin_lock_low srw_spin_lock;
+#else
+# define SRW_LOCK_INIT(key) init(key)
+# define SRW_LOCK_ARGS(file, line) file, line
+# define SRW_LOCK_CALL __FILE__, __LINE__
+
+/** Slim shared-update-exclusive lock with PERFORMANCE_SCHEMA instrumentation */
+class ssux_lock
+{
+ PSI_rwlock *pfs_psi;
+ ssux_lock_impl<false> lock;
+
+ ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_u_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_u_wr_upgrade(const char *file, unsigned line);
+public:
+ void init(mysql_pfs_key_t key)
+ {
+ pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+ lock.init();
+ }
+ void destroy()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ {
+ PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+ pfs_psi= nullptr;
+ }
+ lock.destroy();
+ }
+ void rd_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_rd_lock(file, line);
+ else
+ lock.rd_lock();
+ }
+ void rd_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.rd_unlock();
+ }
+ void u_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_u_lock(file, line);
+ else
+ lock.u_lock();
+ }
+ void u_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.u_unlock();
+ }
+ void wr_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_wr_lock(file, line);
+ else
+ lock.wr_lock();
+ }
+ void wr_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.wr_unlock();
+ }
+ void u_wr_upgrade(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_u_wr_upgrade(file, line);
+ else
+ lock.u_wr_upgrade();
+ }
+ bool rd_lock_try() { return lock.rd_lock_try(); }
+ bool u_lock_try() { return lock.u_lock_try(); }
+ bool wr_lock_try() { return lock.wr_lock_try(); }
+ bool is_waiting() const { return lock.is_waiting(); }
+};
+
+/** Slim reader-writer lock with PERFORMANCE_SCHEMA instrumentation */
+template<bool spinloop>
+class srw_lock_impl
+{
+ PSI_rwlock *pfs_psi;
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+ srw_lock_<spinloop> lock;
+# else
+ ssux_lock_impl<spinloop> lock;
+# endif
+
+ ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+public:
+ void init(mysql_pfs_key_t key)
+ {
+ pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+ lock.init();
+ }
+ void destroy()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ {
+ PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+ pfs_psi= nullptr;
+ }
+ lock.destroy();
+ }
+ void rd_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_rd_lock(file, line);
+ else
+ lock.rd_lock();
+ }
+ void rd_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.rd_unlock();
+ }
+ void wr_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_wr_lock(file, line);
+ else
+ lock.wr_lock();
+ }
+ void wr_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.wr_unlock();
+ }
+ bool rd_lock_try() { return lock.rd_lock_try(); }
+ bool wr_lock_try() { return lock.wr_lock_try(); }
+ void lock_shared() { return rd_lock(SRW_LOCK_CALL); }
+ void unlock_shared() { return rd_unlock(); }
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_locked() const noexcept { return lock.is_locked(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept { return lock.is_write_locked(); }
+#endif
+};
+
+typedef srw_lock_impl<false> srw_lock;
+typedef srw_lock_impl<true> srw_spin_lock;
+
+#endif
diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h
new file mode 100644
index 00000000..2c0167ac
--- /dev/null
+++ b/storage/innobase/include/sux_lock.h
@@ -0,0 +1,472 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "srw_lock.h"
+#include "my_atomic_wrapper.h"
+#ifdef UNIV_DEBUG
+# include <unordered_set>
+#endif
+
+/** A "fat" rw-lock that supports
+S (shared), U (update, or shared-exclusive), and X (exclusive) modes
+as well as recursive U and X latch acquisition
+@tparam ssux ssux_lock_impl or ssux_lock */
+template<typename ssux>
+class sux_lock final
+{
+ /** The underlying non-recursive lock */
+ ssux lock;
+ /** Numbers of U and X locks. Protected by lock. */
+ uint32_t recursive;
+ /** The owner of the U or X lock (0 if none); protected by lock */
+ std::atomic<pthread_t> writer;
+ /** Special writer!=0 value to indicate that the lock is non-recursive
+ and will be released by an I/O thread */
+#if defined __linux__ || defined _WIN32
+ static constexpr pthread_t FOR_IO= pthread_t(~0UL);
+#else
+# define FOR_IO ((pthread_t) ~0UL) /* it could be a pointer */
+#endif
+#ifdef UNIV_DEBUG
+ /** Protects readers */
+ mutable srw_mutex readers_lock;
+ /** Threads that hold the lock in shared mode */
+ std::atomic<std::unordered_multiset<pthread_t>*> readers;
+#endif
+
+ /** The multiplier in recursive for X locks */
+ static constexpr uint32_t RECURSIVE_X= 1U;
+ /** The multiplier in recursive for U locks */
+ static constexpr uint32_t RECURSIVE_U= 1U << 16;
+ /** The maximum allowed level of recursion */
+ static constexpr uint32_t RECURSIVE_MAX= RECURSIVE_U - 1;
+
+public:
+#ifdef UNIV_PFS_RWLOCK
+ inline void init();
+#endif
+ void SRW_LOCK_INIT(mysql_pfs_key_t key)
+ {
+ lock.SRW_LOCK_INIT(key);
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+ ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+ if (auto r= readers.load(std::memory_order_relaxed))
+ ut_ad(r->empty());
+#endif
+ }
+
+ /** Free the rw-lock after init() */
+ void free()
+ {
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+#ifdef UNIV_DEBUG
+ readers_lock.destroy();
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ ut_ad(r->empty());
+ delete r;
+ readers.store(nullptr, std::memory_order_relaxed);
+ }
+#endif
+ lock.destroy();
+ }
+
+ /** needed for dict_index_t::clone() */
+ inline void operator=(const sux_lock&);
+
+#ifdef UNIV_DEBUG
+ /** @return whether no recursive locks are being held */
+ bool not_recursive() const
+ {
+ ut_ad(recursive);
+ return recursive == RECURSIVE_X || recursive == RECURSIVE_U;
+ }
+
+ /** @return the number of X locks being held (by any thread) */
+ unsigned x_lock_count() const { return recursive & RECURSIVE_MAX; }
+#endif
+
+ /** Acquire a recursive lock */
+ template<bool allow_readers> void writer_recurse()
+ {
+ ut_ad(writer == pthread_self());
+ ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+ RECURSIVE_MAX);
+ ut_ad(allow_readers ? recursive : rec);
+ ut_ad(rec < RECURSIVE_MAX);
+ recursive+= allow_readers ? RECURSIVE_U : RECURSIVE_X;
+ }
+
+private:
+ /** Transfer the ownership of a write lock to another thread
+ @param id the new owner of the U or X lock */
+ void set_new_owner(pthread_t id)
+ {
+ IF_DBUG(DBUG_ASSERT(writer.exchange(id, std::memory_order_relaxed)),
+ writer.store(id, std::memory_order_relaxed));
+ }
+ /** Assign the ownership of a write lock to a thread
+ @param id the owner of the U or X lock */
+ void set_first_owner(pthread_t id)
+ {
+ IF_DBUG(DBUG_ASSERT(!writer.exchange(id, std::memory_order_relaxed)),
+ writer.store(id, std::memory_order_relaxed));
+ }
+#ifdef UNIV_DEBUG
+ /** Register the current thread as a holder of a shared lock */
+ void s_lock_register()
+ {
+ const pthread_t id= pthread_self();
+ readers_lock.wr_lock();
+ auto r= readers.load(std::memory_order_relaxed);
+ if (!r)
+ {
+ r= new std::unordered_multiset<pthread_t>();
+ readers.store(r, std::memory_order_relaxed);
+ }
+ r->emplace(id);
+ readers_lock.wr_unlock();
+ }
+#endif
+
+public:
+ /** In crash recovery or the change buffer, claim the ownership
+ of the exclusive block lock to the current thread */
+ void claim_ownership() { set_new_owner(pthread_self()); }
+
+ /** @return whether the current thread is holding X or U latch */
+ bool have_u_or_x() const
+ {
+ if (pthread_self() != writer.load(std::memory_order_relaxed))
+ return false;
+ ut_ad(recursive);
+ return true;
+ }
+ /** @return whether the current thread is holding U but not X latch */
+ bool have_u_not_x() const
+ { return have_u_or_x() && !((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+ /** @return whether the current thread is holding X latch */
+ bool have_x() const
+ { return have_u_or_x() && ((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is holding S latch */
+ bool have_s() const
+ {
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ readers_lock.wr_lock();
+ bool found= r->find(pthread_self()) != r->end();
+ readers_lock.wr_unlock();
+ return found;
+ }
+ return false;
+ }
+ /** @return whether the current thread is holding the latch */
+ bool have_any() const { return have_u_or_x() || have_s(); }
+#endif
+
+ /** Acquire a shared lock */
+ inline void s_lock();
+ inline void s_lock(const char *file, unsigned line);
+ /** Acquire an update lock */
+ inline void u_lock();
+ inline void u_lock(const char *file, unsigned line);
+ /** Acquire an exclusive lock */
+ inline void x_lock(bool for_io= false);
+ inline void x_lock(const char *file, unsigned line);
+ /** Acquire a recursive exclusive lock */
+ void x_lock_recursive() { writer_recurse<false>(); }
+ /** Upgrade an update lock */
+ inline void u_x_upgrade();
+ inline void u_x_upgrade(const char *file, unsigned line);
+ /** Downgrade a single exclusive lock to an update lock */
+ void x_u_downgrade()
+ {
+ ut_ad(have_u_or_x());
+ ut_ad(recursive <= RECURSIVE_MAX);
+ recursive*= RECURSIVE_U;
+ lock.wr_u_downgrade();
+ }
+
+ /** Acquire an exclusive lock or upgrade an update lock
+ @return whether U locks were upgraded to X */
+ inline bool x_lock_upgraded();
+
+ /** @return whether a shared lock was acquired */
+ bool s_lock_try()
+ {
+ bool acquired= lock.rd_lock_try();
+ ut_d(if (acquired) s_lock_register());
+ return acquired;
+ }
+
+ /** Try to acquire an update lock
+ @param for_io whether the lock will be released by another thread
+ @return whether the update lock was acquired */
+ inline bool u_lock_try(bool for_io);
+
+ /** Try to acquire an exclusive lock
+ @return whether an exclusive lock was acquired */
+ inline bool x_lock_try();
+
+ /** Release a shared lock */
+ void s_unlock()
+ {
+#ifdef UNIV_DEBUG
+ const pthread_t id= pthread_self();
+ auto r= readers.load(std::memory_order_relaxed);
+ ut_ad(r);
+ readers_lock.wr_lock();
+ auto i= r->find(id);
+ ut_ad(i != r->end());
+ r->erase(i);
+ readers_lock.wr_unlock();
+#endif
+ lock.rd_unlock();
+ }
+ /** Release an update or exclusive lock
+ @param allow_readers whether we are releasing a U lock
+ @param claim_ownership whether the lock was acquired by another thread */
+ void u_or_x_unlock(bool allow_readers, bool claim_ownership= false)
+ {
+ ut_d(auto owner= writer.load(std::memory_order_relaxed));
+ ut_ad(owner == pthread_self() ||
+ (owner == FOR_IO && claim_ownership &&
+ recursive == (allow_readers ? RECURSIVE_U : RECURSIVE_X)));
+ ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+ RECURSIVE_MAX);
+ ut_ad(rec);
+ if (!(recursive-= allow_readers ? RECURSIVE_U : RECURSIVE_X))
+ {
+ set_new_owner(0);
+ if (allow_readers)
+ lock.u_unlock();
+ else
+ lock.wr_unlock();
+ }
+ }
+ /** Release an update lock */
+ void u_unlock(bool claim_ownership= false)
+ { u_or_x_unlock(true, claim_ownership); }
+ /** Release an exclusive lock */
+ void x_unlock(bool claim_ownership= false)
+ { u_or_x_unlock(false, claim_ownership); }
+
+ /** @return whether any writer is waiting */
+ bool is_waiting() const { return lock.is_waiting(); }
+
+ bool is_write_locked() const { return lock.is_write_locked(); }
+
+ bool is_locked_or_waiting() const { return lock.is_locked_or_waiting(); }
+
+ inline void lock_shared();
+ inline void unlock_shared();
+};
+
+typedef sux_lock<ssux_lock_impl<true>> block_lock;
+
+#ifndef UNIV_PFS_RWLOCK
+typedef sux_lock<ssux_lock_impl<false>> index_lock;
+#else
+typedef sux_lock<ssux_lock> index_lock;
+
+template<> inline void sux_lock<ssux_lock_impl<true>>::init()
+{
+ lock.init();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+ ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+ if (auto r= readers.load(std::memory_order_relaxed))
+ ut_ad(r->empty());
+#endif
+}
+
+template<>
+inline void sux_lock<ssux_lock>::s_lock(const char *file, unsigned line)
+{
+ ut_ad(!have_x());
+ ut_ad(!have_s());
+ lock.rd_lock(file, line);
+ ut_d(s_lock_register());
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_lock(const char *file, unsigned line)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<true>();
+ else
+ {
+ lock.u_lock(file, line);
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(id);
+ }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::x_lock(const char *file, unsigned line)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<false>();
+ else
+ {
+ lock.wr_lock(file, line);
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_x_upgrade(const char *file, unsigned line)
+{
+ ut_ad(have_u_not_x());
+ lock.u_wr_upgrade(file, line);
+ recursive/= RECURSIVE_U;
+}
+#endif
+
+/** needed for dict_index_t::clone() */
+template<> inline void index_lock::operator=(const sux_lock&)
+{
+ memset((void*) this, 0, sizeof *this);
+}
+
+template<typename ssux> inline void sux_lock<ssux>::s_lock()
+{
+ ut_ad(!have_x());
+ ut_ad(!have_s());
+ lock.rd_lock();
+ ut_d(s_lock_register());
+}
+
+template<typename ssux>
+inline void sux_lock<ssux>::lock_shared() { s_lock(); }
+template<typename ssux>
+inline void sux_lock<ssux>::unlock_shared() { s_unlock(); }
+
+template<typename ssux> inline void sux_lock<ssux>::u_lock()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<true>();
+ else
+ {
+ lock.u_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(id);
+ }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::x_lock(bool for_io)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ ut_ad(!for_io);
+ writer_recurse<false>();
+ }
+ else
+ {
+ lock.wr_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(for_io ? FOR_IO : id);
+ }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::u_x_upgrade()
+{
+ ut_ad(have_u_not_x());
+ lock.u_wr_upgrade();
+ recursive/= RECURSIVE_U;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_upgraded()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ ut_ad(recursive);
+ static_assert(RECURSIVE_X == 1, "compatibility");
+ if (recursive & RECURSIVE_MAX)
+ {
+ writer_recurse<false>();
+ return false;
+ }
+ /* Upgrade the lock. */
+ lock.u_wr_upgrade();
+ recursive/= RECURSIVE_U;
+ return true;
+ }
+ else
+ {
+ lock.wr_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ return false;
+ }
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::u_lock_try(bool for_io)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ if (for_io)
+ return false;
+ writer_recurse<true>();
+ return true;
+ }
+ if (lock.u_lock_try())
+ {
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(for_io ? FOR_IO : id);
+ return true;
+ }
+ return false;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_try()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ writer_recurse<false>();
+ return true;
+ }
+ if (lock.wr_lock_try())
+ {
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ return true;
+ }
+ return false;
+}
diff --git a/storage/innobase/include/transactional_lock_guard.h b/storage/innobase/include/transactional_lock_guard.h
new file mode 100644
index 00000000..168a6897
--- /dev/null
+++ b/storage/innobase/include/transactional_lock_guard.h
@@ -0,0 +1,174 @@
+/*****************************************************************************
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+
+#if defined __powerpc64__
+#elif defined __s390__
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) && !defined(__clang__)
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if __GNUC__ >= 8
+# elif defined __clang_major__ && __clang_major__ > 6
+# else
+# define NO_ELISION
+# endif
+#else /* Transactional memory has not been implemented for this ISA */
+# define NO_ELISION
+#endif
+
+#ifdef NO_ELISION
+constexpr bool have_transactional_memory= false;
+# ifdef UNIV_DEBUG
+static inline bool xtest() { return false; }
+# endif
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+#else
+# if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+
+# include <immintrin.h>
+# if defined __GNUC__ && !defined __INTEL_COMPILER
+# define TRANSACTIONAL_TARGET __attribute__((target("rtm"),hot))
+# define TRANSACTIONAL_INLINE __attribute__((target("rtm"),hot,always_inline))
+# else
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+# endif
+
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{
+ return have_transactional_memory && _xbegin() == _XBEGIN_STARTED;
+}
+
+# ifdef UNIV_DEBUG
+# ifdef __GNUC__
+/** @return whether a memory transaction is active */
+bool xtest();
+# else
+static inline bool xtest() { return have_transactional_memory && _xtest(); }
+# endif
+# endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { _xabort(0); }
+
+TRANSACTIONAL_INLINE static inline void xend() { _xend(); }
+# elif defined __powerpc64__ || defined __s390__
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+# define TRANSACTIONAL_TARGET __attribute__((hot))
+# define TRANSACTIONAL_INLINE __attribute__((hot,always_inline))
+
+/**
+ Newer gcc compilers only provide __builtin_{htm}
+ functions when the -mhtm CFLAG is actually provided. So
+ we've got the option of including it globally, or
+ pushing down the inclusion of htmxlintrin.h to one
+ file with -mhtm enabled and removing the inline
+ optimization.
+
+ Per FIXME in s390x's htmxlintrin.h, the __TM_simple_begin
+ isn't always_inline resulting in duplicate definitions if
+ it where included more than once. While xabort and xend
+ could be implemented here, we keep the implementation the
+ same as ppc64.
+ */
+TRANSACTIONAL_TARGET bool xbegin();
+TRANSACTIONAL_TARGET void xabort();
+TRANSACTIONAL_TARGET void xend();
+# ifdef UNIV_DEBUG
+bool xtest();
+# endif
+
+# endif
+#endif
+
+template<class mutex>
+class transactional_lock_guard
+{
+ mutex &m;
+
+public:
+ TRANSACTIONAL_INLINE transactional_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ m.lock();
+ }
+ transactional_lock_guard(const transactional_lock_guard &)= delete;
+ TRANSACTIONAL_INLINE ~transactional_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock();
+ }
+
+#ifndef NO_ELISION
+ bool was_elided() const noexcept { return !m.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+template<class mutex>
+class transactional_shared_lock_guard
+{
+ mutex &m;
+#ifndef NO_ELISION
+ bool elided;
+#else
+ static constexpr bool elided= false;
+#endif
+
+public:
+ TRANSACTIONAL_INLINE transactional_shared_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (!m.is_write_locked())
+ {
+ elided= true;
+ return;
+ }
+ xabort();
+ }
+ elided= false;
+#endif
+ m.lock_shared();
+ }
+ transactional_shared_lock_guard(const transactional_shared_lock_guard &)=
+ delete;
+ TRANSACTIONAL_INLINE ~transactional_shared_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock_shared();
+ }
+
+ bool was_elided() const noexcept { return elided; }
+};
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
new file mode 100644
index 00000000..caacfa09
--- /dev/null
+++ b/storage/innobase/include/trx0i_s.h
@@ -0,0 +1,277 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "trx0types.h"
+#include "dict0types.h"
+#include "buf0types.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN 8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN 1024
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_foreign_key_error */
+#define TRX_I_S_TRX_FK_ERROR_MAX_LEN 256
+
+/** Safely copy strings in to the INNODB_TRX table's
+string based columns */
+#define TRX_I_S_STRING_COPY(data, field, constraint, tcache) \
+do { \
+ if (strlen(data) > constraint) { \
+ char buff[constraint + 1]; \
+ strncpy(buff, data, constraint); \
+ buff[constraint] = '\0'; \
+ \
+ field = static_cast<const char*>( \
+ ha_storage_put_memlim( \
+ (tcache)->storage, buff, constraint + 1,\
+ MAX_ALLOWED_FOR_STORAGE(tcache))); \
+ } else { \
+ field = static_cast<const char*>( \
+ ha_storage_put_str_memlim( \
+ (tcache)->storage, data, \
+ MAX_ALLOWED_FOR_STORAGE(tcache))); \
+ } \
+} while (0)
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+struct i_s_locks_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t {
+ i_s_locks_row_t* value; /*!< row of
+ INFORMATION_SCHEMA.innodb_locks*/
+ i_s_hash_chain_t* next; /*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_t {
+ trx_id_t lock_trx_id; /*!< transaction identifier */
+ const char* lock_table; /*!< table name from
+ lock_get_table_name() */
+ /** index name of a record lock; NULL for table locks */
+ const char* lock_index;
+ /** page identifier of the record; (0,0) if !lock_index */
+ page_id_t lock_page;
+ /** heap number of the record; 0 if !lock_index */
+ uint16_t lock_rec;
+ /** lock mode corresponding to lock_mode_values_typelib */
+ uint8_t lock_mode;
+ /** (some) content of the record, if available in the buffer pool;
+ NULL if !lock_index */
+ const char* lock_data;
+
+ /** The following are auxiliary and not included in the table */
+ /* @{ */
+ table_id_t lock_table_id;
+ /*!< table identifier from
+ lock_get_table_id */
+ i_s_hash_chain_t hash_chain; /*!< hash table chain node for
+ trx_i_s_cache_t::locks_hash */
+ /* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_t {
+ trx_id_t trx_id; /*!< transaction identifier */
+ const char* trx_state;
+ time_t trx_started; /*!< trx_t::start_time */
+ const i_s_locks_row_t* requested_lock_row;
+ /*!< pointer to a row
+ in innodb_locks if trx
+ is waiting, or NULL */
+ time_t trx_wait_started; /*!< trx_t->lock.wait_started */
+ uintmax_t trx_weight; /*!< TRX_WEIGHT() */
+ ulint trx_mysql_thread_id; /*!< thd_get_thread_id() */
+ const char* trx_query; /*!< MySQL statement being
+ executed in the transaction */
+ CHARSET_INFO* trx_query_cs; /*!< the charset of trx_query */
+ const char* trx_operation_state; /*!< trx_t::op_info */
+ ulint trx_tables_in_use;/*!< n_mysql_tables_in_use in
+ trx_t */
+ ulint trx_tables_locked;
+ /*!< mysql_n_tables_locked in
+ trx_t */
+ ulint trx_lock_structs;/*!< list len of trx_locks in
+ trx_t */
+ ulint trx_lock_memory_bytes;
+ /*!< mem_heap_get_size(
+ trx->lock_heap) */
+ ulint trx_rows_locked;/*!< trx_lock_t::n_rec_locks */
+ uintmax_t trx_rows_modified;/*!< trx_t::undo_no */
+ uint trx_isolation_level;
+ /*!< trx_t::isolation_level */
+ bool trx_unique_checks;
+ /*!< check_unique_secondary in trx_t*/
+ bool trx_foreign_key_checks;
+ /*!< check_foreigns in trx_t */
+ const char* trx_foreign_key_error;
+ /*!< detailed_error in trx_t */
+ bool trx_is_read_only;
+ /*!< trx_t::read_only */
+ bool trx_is_autocommit_non_locking;
+ /*!< trx:t::is_autocommit_non_locking()
+ */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_t {
+ const i_s_locks_row_t* requested_lock_row; /*!< requested lock */
+ const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+ I_S_INNODB_TRX, /*!< INFORMATION_SCHEMA.innodb_trx */
+ I_S_INNODB_LOCKS, /*!< INFORMATION_SCHEMA.innodb_locks */
+ I_S_INNODB_LOCK_WAITS /*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t* trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table); /*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n); /*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache */
+
+/*******************************************************************//**
+Returns true, if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+/** The maximum length of a resulting lock_id_size in
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size);/*!< in: size of the lock id
+ buffer */
+
+#endif /* trx0i_s_h */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
new file mode 100644
index 00000000..3ddd2e98
--- /dev/null
+++ b/storage/innobase/include/trx0purge.h
@@ -0,0 +1,427 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0sys.h"
+#include "que0types.h"
+#include "srw_lock.h"
+
+#include <queue>
+#include <unordered_map>
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in] trx transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr);
+
+/**
+Remove unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller (purge_coordinator_callback)
+must not have any latches on undo log pages!
+*/
+void trx_purge_truncate_history();
+
+/**
+Run a purge batch.
+@param n_tasks number of purge tasks to submit to the queue
+@param history_size trx_sys.history_size()
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, ulint history_size);
+
+/** Rollback segements from a given transaction with trx-no
+scheduled for purge. */
+class TrxUndoRsegs {
+private:
+ typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> >
+ trx_rsegs_t;
+public:
+ typedef trx_rsegs_t::iterator iterator;
+ typedef trx_rsegs_t::const_iterator const_iterator;
+
+ TrxUndoRsegs() = default;
+
+ /** Constructor */
+ TrxUndoRsegs(trx_rseg_t& rseg)
+ : trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {}
+ /** Constructor */
+ TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg)
+ : trx_no(trx_no), m_rsegs(1, &rseg) {}
+
+ bool operator!=(const TrxUndoRsegs& other) const
+ { return trx_no != other.trx_no; }
+ bool empty() const { return m_rsegs.empty(); }
+ void erase(iterator& it) { m_rsegs.erase(it); }
+ iterator begin() { return(m_rsegs.begin()); }
+ iterator end() { return(m_rsegs.end()); }
+ const_iterator begin() const { return m_rsegs.begin(); }
+ const_iterator end() const { return m_rsegs.end(); }
+
+ /** Compare two TrxUndoRsegs based on trx_no.
+ @param elem1 first element to compare
+ @param elem2 second element to compare
+ @return true if elem1 > elem2 else false.*/
+ bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs)
+ {
+ return(lhs.trx_no > rhs.trx_no);
+ }
+
+ /** Copy of trx_rseg_t::last_trx_no() */
+ trx_id_t trx_no= 0;
+private:
+ /** Rollback segments of a transaction, scheduled for purge. */
+ trx_rsegs_t m_rsegs{};
+};
+
+typedef std::priority_queue<
+ TrxUndoRsegs,
+ std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >,
+ TrxUndoRsegs> purge_pq_t;
+
+/** Chooses the rollback segment with the oldest committed transaction */
+struct TrxUndoRsegsIterator {
+ /** Constructor */
+ TrxUndoRsegsIterator();
+ /** Sets the next rseg to purge in purge_sys.
+ Executed in the purge coordinator thread.
+ @retval false when nothing is to be purged
+ @retval true when purge_sys.rseg->latch was locked */
+ inline bool set_next();
+
+private:
+ // Disable copying
+ TrxUndoRsegsIterator(const TrxUndoRsegsIterator&);
+ TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&);
+
+ /** The current element to process */
+ TrxUndoRsegs m_rsegs;
+ /** Track the current element in m_rsegs */
+ TrxUndoRsegs::const_iterator m_iter;
+};
+
+/** The control structure used in the purge operation */
+class purge_sys_t
+{
+ friend TrxUndoRsegsIterator;
+public:
+ /** latch protecting view, m_enabled */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch;
+private:
+ /** Read view at the start of a purge batch. Any encountered index records
+ that are older than view will be removed. */
+ ReadViewBase view;
+ /** whether the subsystem has been initialized */
+ bool m_initialized{false};
+ /** whether purge is enabled; protected by latch and std::atomic */
+ std::atomic<bool> m_enabled{false};
+public:
+ /** whether purge is active (may hold table handles) */
+ std::atomic<bool> m_active{false};
+private:
+ /** number of pending stop() calls without resume() */
+ Atomic_counter<uint32_t> m_paused;
+ /** number of stop_SYS() calls without resume_SYS() */
+ Atomic_counter<uint32_t> m_SYS_paused;
+ /** number of stop_FTS() calls without resume_FTS() */
+ Atomic_counter<uint32_t> m_FTS_paused;
+
+ /** latch protecting end_view */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch;
+ /** Read view at the end of a purge batch (copied from view). Any undo pages
+ containing records older than end_view may be freed. */
+ ReadViewBase end_view;
+
+ struct hasher
+ {
+ size_t operator()(const page_id_t &id) const { return size_t(id.raw()); }
+ };
+
+ using unordered_map =
+ std::unordered_map<const page_id_t, buf_block_t*, hasher,
+#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 8
+ std::equal_to<page_id_t>
+ /* GCC 4.8.5 would fail to find a matching allocator */
+#else
+ std::equal_to<page_id_t>,
+ ut_allocator<std::pair<const page_id_t, buf_block_t*>>
+#endif
+ >;
+ /** map of buffer-fixed undo log pages processed during a purge batch */
+ unordered_map pages;
+public:
+ /** @return the number of processed undo pages */
+ size_t n_pages_handled() const { return pages.size(); }
+
+ /** Look up an undo log page.
+ @param id undo page identifier
+ @return undo page
+ @retval nullptr in case the page is corrupted */
+ buf_block_t *get_page(page_id_t id);
+
+ que_t* query; /*!< The query graph which will do the
+ parallelized purge operation */
+
+ /** Iterator to the undo log records of committed transactions */
+ struct iterator
+ {
+ bool operator<=(const iterator& other) const
+ {
+ if (trx_no < other.trx_no) return true;
+ if (trx_no > other.trx_no) return false;
+ return undo_no <= other.undo_no;
+ }
+
+ /** Free the undo pages up to this. */
+ dberr_t free_history() const;
+
+ /** trx_t::no of the committed transaction */
+ trx_id_t trx_no;
+ /** The record number within the committed transaction's undo
+ log, increasing, purged from from 0 onwards */
+ undo_no_t undo_no;
+ };
+
+ /** The tail of the purge queue; the last parsed undo log of a
+ committed transaction. */
+ iterator tail;
+ /** The head of the purge queue; any older undo logs of committed
+ transactions may be discarded (history list truncation).
+ Protected by latch. */
+ iterator head;
+ /*-----------------------------*/
+ bool next_stored; /*!< whether rseg holds the next record
+ to purge */
+ trx_rseg_t* rseg; /*!< Rollback segment for the next undo
+ record to purge */
+private:
+ uint32_t page_no; /*!< Page number for the next undo
+ record to purge, page number of the
+ log header, if dummy record */
+ uint32_t hdr_page_no; /*!< Header page of the undo log where
+ the next record to purge belongs */
+ uint16_t offset; /*!< Page offset for the next undo
+ record to purge, 0 if the dummy
+ record */
+ uint16_t hdr_offset; /*!< Header byte offset on the page */
+
+
+ TrxUndoRsegsIterator
+ rseg_iter; /*!< Iterator to get the next rseg
+ to process */
+public:
+ purge_pq_t purge_queue; /*!< Binary min-heap, ordered on
+ TrxUndoRsegs::trx_no. It is protected
+ by the pq_mutex */
+ mysql_mutex_t pq_mutex; /*!< Mutex protecting purge_queue */
+
+ /** Undo tablespace file truncation (only accessed by the
+ srv_purge_coordinator_thread) */
+ struct {
+ /** The undo tablespace that is currently being truncated */
+ fil_space_t* current;
+ /** The undo tablespace that was last truncated */
+ fil_space_t* last;
+ } truncate;
+
+ /** Create the instance */
+ void create();
+
+ /** Close the purge system on shutdown */
+ void close();
+
+ /** @return whether purge is enabled */
+ bool enabled() { return m_enabled.load(std::memory_order_relaxed); }
+ /** @return whether the purge coordinator is paused */
+ bool paused()
+ { return m_paused != 0; }
+
+ /** Enable purge at startup. */
+ void coordinator_startup()
+ {
+ ut_ad(!enabled());
+ m_enabled.store(true, std::memory_order_relaxed);
+ wake_if_not_active();
+ }
+
+ /** Disable purge at shutdown */
+ void coordinator_shutdown()
+ {
+ ut_ad(enabled());
+ m_enabled.store(false, std::memory_order_relaxed);
+ }
+
+ /** @return whether the purge tasks are active */
+ static bool running();
+
+ /** Stop purge during FLUSH TABLES FOR EXPORT. */
+ void stop();
+ /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
+ void resume();
+
+ /** Close and reopen all tables in case of a MDL conflict with DDL */
+ dict_table_t *close_and_reopen(table_id_t id, THD *thd, MDL_ticket **mdl);
+private:
+ /** Suspend purge during a DDL operation on FULLTEXT INDEX tables */
+ void wait_FTS(bool also_sys);
+public:
+ /** Suspend purge in data dictionary tables */
+ void stop_SYS() { m_SYS_paused++; }
+ /** Resume purge in data dictionary tables */
+ static void resume_SYS(void *);
+
+ /** Pause purge during a DDL operation that could drop FTS_ tables. */
+ void stop_FTS();
+ /** Resume purge after stop_FTS(). */
+ void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); }
+ /** @return whether stop_SYS() is in effect */
+ bool must_wait_FTS() const { return m_FTS_paused; }
+
+private:
+ /**
+ Get the next record to purge and update the info in the purge system.
+ @param roll_ptr undo log pointer to the record
+ @return buffer-fixed reference to undo log record
+ @retval {nullptr,1} if the whole undo log can skipped in purge
+ @retval {nullptr,0} if nothing is left, or on corruption */
+ inline trx_purge_rec_t get_next_rec(roll_ptr_t roll_ptr);
+
+ /** Choose the next undo log to purge.
+ @return whether anything is to be purged */
+ bool choose_next_log();
+
+ /** Update the last not yet purged history log info in rseg when
+ we have purged a whole undo log. Advances also purge_trx_no
+ past the purged log. */
+ void rseg_get_next_history_log();
+
+public:
+ /**
+ Fetch the next undo log record from the history list to purge.
+ @return buffer-fixed reference to undo log record
+ @retval {nullptr,1} if the whole undo log can skipped in purge
+ @retval {nullptr,0} if nothing is left, or on corruption */
+ inline trx_purge_rec_t fetch_next_rec();
+
+ /** Determine if the history of a transaction is purgeable.
+ @param trx_id transaction identifier
+ @return whether the history is purgeable */
+ TRANSACTIONAL_TARGET bool is_purgeable(trx_id_t trx_id) const;
+
+ /** A wrapper around ReadView::low_limit_no(). */
+ trx_id_t low_limit_no() const
+ {
+ /* This function may only be called by purge_coordinator_callback().
+
+ The purge coordinator task may call this without holding any latch,
+ because it is the only thread that may modify purge_sys.view.
+
+ Any other threads that access purge_sys.view must hold purge_sys.latch,
+ typically via purge_sys_t::view_guard. */
+ return view.low_limit_no();
+ }
+ /** A wrapper around ReadView::sees(). */
+ trx_id_t sees(trx_id_t id) const
+ {
+ /* This function may only be called by purge_coordinator_callback().
+
+ The purge coordinator task may call this without holding any latch,
+ because it is the only thread that may modify purge_sys.view.
+
+ Any other threads that access purge_sys.view must hold purge_sys.latch,
+ typically via purge_sys_t::view_guard. */
+ return view.sees(id);
+ }
+ /** A wrapper around trx_sys_t::clone_oldest_view(). */
+ template<bool also_end_view= false>
+ void clone_oldest_view()
+ {
+ if (!also_end_view)
+ wait_FTS(true);
+ latch.wr_lock(SRW_LOCK_CALL);
+ trx_sys.clone_oldest_view(&view);
+ if (also_end_view)
+ (end_view= view).
+ clamp_low_limit_id(head.trx_no ? head.trx_no : tail.trx_no);
+ latch.wr_unlock();
+ }
+
+ /** Wake up the purge threads if there is work to do. */
+ void wake_if_not_active();
+
+ /** Release undo pages and update end_view at the end of a purge batch.
+ @retval false when nothing is to be purged
+ @retval true when purge_sys.rseg->latch was locked */
+ inline void batch_cleanup(const iterator &head);
+
+ struct view_guard
+ {
+ inline view_guard();
+ inline ~view_guard();
+
+ /** @return purge_sys.view */
+ inline const ReadViewBase &view() const;
+ };
+
+ struct end_view_guard
+ {
+ inline end_view_guard();
+ inline ~end_view_guard();
+
+ /** @return purge_sys.end_view */
+ inline const ReadViewBase &view() const;
+ };
+
+ /** Stop the purge thread and check n_ref_count of all auxiliary
+ and common table associated with the fts table.
+ @param table parent FTS table
+ @param already_stopped True indicates purge threads were
+ already stopped */
+ void stop_FTS(const dict_table_t &table, bool already_stopped=false);
+};
+
+/** The global data structure coordinating a purge */
+extern purge_sys_t purge_sys;
+
+purge_sys_t::view_guard::view_guard()
+{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); }
+
+purge_sys_t::view_guard::~view_guard()
+{ purge_sys.latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::view_guard::view() const
+{ return purge_sys.view; }
+
+purge_sys_t::end_view_guard::end_view_guard()
+{ purge_sys.end_latch.rd_lock(); }
+
+purge_sys_t::end_view_guard::~end_view_guard()
+{ purge_sys.end_latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::end_view_guard::view() const
+{ return purge_sys.end_view; }
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
new file mode 100644
index 00000000..3d9b1868
--- /dev/null
+++ b/storage/innobase/include/trx0rec.h
@@ -0,0 +1,299 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0types.h"
+#include "row0types.h"
+#include "page0types.h"
+#include "que0types.h"
+
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+inline undo_no_t trx_undo_rec_get_undo_no(const trx_undo_rec_t *undo_rec)
+{
+ return mach_u64_read_much_compressed(undo_rec + 3);
+}
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no) \
+ ((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ byte* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ byte* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+ externally stored fild */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+ MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+const byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ const byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t**ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ const byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ byte* info_bits); /*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undorecord */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ byte info_bits,/*!< in: info bits from this undo record */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd); /*!< out, own: update vector */
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@return DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index; in updates,
+ may contain a clustered index
+ record tuple that also contains
+ virtual columns of the table;
+ otherwise, NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index; NULL if insert */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the
+ undo log record */
+ MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+
+/** status bit used for trx_undo_prev_version_build() */
+
+/** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it
+is being called purge view and we would like to get the purge record
+even it is in the purge view (in normal case, it will return without
+fetching the purge record */
+static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1;
+
+/** This tells trx_undo_prev_version_build() to fetch the old value in
+the undo log (which is the after image for an update) */
+static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2;
+
+/** indicate a call from row_vers_old_has_index_entry() */
+static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4;
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec version of a clustered index record
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param heap memory heap from which the memory needed is
+ allocated
+@param old_vers previous version or NULL if rec is the
+ first inserted version, or if history data
+ has been deleted (an error), or if the purge
+ could have removed the version
+ though it has not yet done so
+@param v_heap memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ diffs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection
+@param vrow virtual column info, if any
+@param v_status status determine if it is going into this
+ function by purge thread or not.
+ And if we read "after image" of undo log
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+dberr_t
+trx_undo_prev_version_build(
+ const rec_t *rec,
+ dict_index_t *index,
+ rec_offs *offsets,
+ mem_heap_t *heap,
+ rec_t **old_vers,
+ mem_heap_t *v_heap,
+ dtuple_t **vrow,
+ ulint v_status);
+
+/** Read from an undo log record a non-virtual column value.
+@param ptr pointer to remaining part of the undo record
+@param field stored field
+@param len length of the field, or UNIV_SQL_NULL
+@param orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+ uint32_t *len, uint32_t *orig_len);
+
+/** Read virtual column value from undo log
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in,out] row the dtuple to fill
+@param[in] in_purge whether this is called by purge */
+void
+trx_undo_read_v_cols(
+ const dict_table_t* table,
+ const byte* ptr,
+ dtuple_t* row,
+ bool in_purge);
+
+/** Read virtual column index from undo log if the undo log contains such
+info, and verify the column is still indexed, and output its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in] first_v_col if this is the first virtual column, which
+ has the version marker
+@param[in,out] is_undo_log his function is used to parse both undo log,
+ and online log for virtual columns. So
+ check to see if this is undo log
+@param[out] field_no the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+ const dict_table_t* table,
+ const byte* ptr,
+ bool first_v_col,
+ bool* is_undo_log,
+ uint32_t* field_no);
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+/** Undo log records for DDL operations
+
+Note: special rollback and purge triggers exist for SYS_INDEXES records:
+@see dict_drop_index_tree() */
+enum trx_undo_ddl_type
+{
+ /** RENAME TABLE (logging the old table name).
+
+ Because SYS_TABLES has PRIMARY KEY(NAME), the row-level undo log records
+ for SYS_TABLES cannot be distinguished from DROP TABLE, CREATE TABLE. */
+ TRX_UNDO_RENAME_TABLE= 9,
+ /** insert a metadata pseudo-record for instant ALTER TABLE */
+ TRX_UNDO_INSERT_METADATA= 10
+};
+
+/* DML operations */
+#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
+#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
+ record */
+#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to
+ a not delete marked record; also the
+ fields of the record can change */
+#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
+ do not change */
+/** Bulk insert operation. It is written only when the table is
+under exclusive lock and the clustered index root page latch is being held,
+and the clustered index is empty. Rollback will empty the table and
+free the leaf segment of all indexes, re-create the new
+leaf segment and re-initialize the root page alone. */
+#define TRX_UNDO_EMPTY 15
+
+#define TRX_UNDO_CMPL_INFO_MULT 16U /* compilation info is multiplied by
+ this and ORed to the type above */
+#define TRX_UNDO_UPD_EXTERN 128U /* This bit can be ORed to type_cmpl
+ to denote that we updated external
+ storage fields: used by purge to
+ free the external storage */
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */
+extern const dtuple_t trx_undo_metadata;
+
+/** Read the table id from an undo log record.
+@param[in] rec Undo log record
+@return table id stored as a part of undo log record */
+inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec)
+{
+ rec+= 3;
+ mach_read_next_much_compressed(&rec);
+ return mach_read_next_much_compressed(&rec);
+}
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
new file mode 100644
index 00000000..9ef9ebe9
--- /dev/null
+++ b/storage/innobase/include/trx0roll.h
@@ -0,0 +1,168 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.h
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "trx0trx.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+extern bool trx_rollback_is_active;
+extern const trx_t* trx_roll_crash_recv_trx;
+
+/** Report progress when rolling back a row of a recovered transaction. */
+void trx_roll_report_progress();
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+@param all true=roll back all recovered active transactions;
+false=roll back any incomplete dictionary transaction */
+void
+trx_rollback_recovered(bool all);
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread. */
+void trx_rollback_all_recovered(void*);
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Rollback node states */
+enum roll_node_state {
+ ROLL_NODE_NONE = 0, /*!< Unknown state */
+ ROLL_NODE_SEND, /*!< about to send a rollback signal to
+ the transaction */
+ ROLL_NODE_WAIT /*!< rollback signal sent to the
+ transaction, waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */
+ enum roll_node_state state; /*!< node execution state */
+ const trx_savept_t* savept; /*!< savepoint to which to
+ roll back, in the case of a
+ partial rollback */
+ que_thr_t* undo_thr;/*!< undo query graph */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_t{
+ char* name; /*!< savepoint name */
+ trx_savept_t savept; /*!< the undo number corresponding to
+ the savepoint */
+ int64_t mysql_binlog_cache_pos;
+ /*!< the MySQL binlog cache position
+ corresponding to this savepoint, not
+ defined if the MySQL binlogging is not
+ enabled */
+ UT_LIST_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< the list of savepoints of a
+ transaction */
+};
+
+#endif
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
new file mode 100644
index 00000000..43e0c290
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.h
@@ -0,0 +1,301 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "trx0types.h"
+#include "fut0lst.h"
+
+/** Create a rollback segment header.
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the created rollback segment
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+ trx_id_t max_trx_id, mtr_t *mtr,
+ dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init();
+
+/** Create the temporary rollback segments. */
+dberr_t trx_temp_rseg_create(mtr_t *mtr);
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS (srv_page_size / 16)
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
+
+/** The rollback segment memory object */
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
+{
+ /** tablespace containing the rollback segment; constant after init() */
+ fil_space_t *space;
+ /** latch protecting everything except page_no, space */
+ srw_spin_lock latch;
+ /** rollback segment header page number; constant after init() */
+ uint32_t page_no;
+ /** length of the TRX_RSEG_HISTORY list (number of transactions) */
+ uint32_t history_size;
+
+ /** Last known transaction that has not been purged yet,
+ or 0 if everything has been purged. */
+ trx_id_t needs_purge;
+
+private:
+ /** Reference counter to track is_persistent() transactions,
+ with SKIP flag. */
+ std::atomic<uint32_t> ref;
+
+ /** Whether undo tablespace truncation is pending */
+ static constexpr uint32_t SKIP= 1;
+ /** Transaction reference count multiplier */
+ static constexpr uint32_t REF= 2;
+
+ uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); }
+
+ /** Set the SKIP bit */
+ void ref_set_skip()
+ {
+ static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+ ref.fetch_or(SKIP, std::memory_order_relaxed);
+#endif
+ }
+ /** Clear a bit in ref */
+ void ref_reset_skip()
+ {
+ static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+ ref.fetch_and(~SKIP, std::memory_order_relaxed);
+#endif
+ }
+
+public:
+
+ /** Initialize the fields that are not zero-initialized. */
+ void init(fil_space_t *space, uint32_t page);
+ /** Reinitialize the fields on undo tablespace truncation. */
+ void reinit(uint32_t page);
+ /** Clean up. */
+ void destroy();
+
+ /** Note that undo tablespace truncation was started. */
+ void set_skip_allocation() { ut_ad(is_persistent()); ref_set_skip(); }
+ /** Note that undo tablespace truncation was completed. */
+ void clear_skip_allocation()
+ {
+ ut_ad(is_persistent());
+#if defined DBUG_OFF
+ ref_reset_skip();
+#else
+ ut_d(auto r=) ref.fetch_and(~SKIP, std::memory_order_relaxed);
+ ut_ad(r == SKIP);
+#endif
+ }
+ /** @return whether the segment is marked for undo truncation */
+ bool skip_allocation() const
+ { return ref.load(std::memory_order_acquire) & SKIP; }
+ /** Increment the reference count */
+ void acquire()
+ { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); }
+ /** Increment the reference count if possible
+ @retval true if the reference count was incremented
+ @retval false if skip_allocation() holds */
+ bool acquire_if_available()
+ {
+ uint32_t r= 0;
+ while (!ref.compare_exchange_weak(r, r + REF,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed))
+ if (r & SKIP)
+ return false;
+ return true;
+ }
+
+ /** Decrement the reference count */
+ void release()
+ {
+ ut_d(const auto r=)
+ ref.fetch_sub(REF, std::memory_order_relaxed);
+ ut_ad(r >= REF);
+ }
+ /** @return whether references exist */
+ bool is_referenced() const { return ref_load() >= REF; }
+
+ /** current size in pages */
+ uint32_t curr_size;
+
+ /** List of undo logs (transactions) */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+ /** List of undo log segments cached for fast reuse */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
+
+ /** Last not yet purged undo log header; FIL_NULL if all purged */
+ uint32_t last_page_no;
+
+ /** trx_t::no | last_offset << 48 */
+ uint64_t last_commit_and_offset;
+
+ /** @return the commit ID of the last committed transaction */
+ trx_id_t last_trx_no() const
+ { return last_commit_and_offset & ((1ULL << 48) - 1); }
+ /** @return header offset of the last committed transaction */
+ uint16_t last_offset() const
+ { return static_cast<uint16_t>(last_commit_and_offset >> 48); }
+
+ void set_last_commit(uint16_t last_offset, trx_id_t trx_no)
+ {
+ last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
+ }
+
+ /** @return the page identifier */
+ page_id_t page_id() const { return page_id_t{space->id, page_no}; }
+
+ /** @return the rollback segment header page, exclusively latched */
+ buf_block_t *get(mtr_t *mtr, dberr_t *err) const;
+
+ /** @return whether the rollback segment is persistent */
+ bool is_persistent() const
+ {
+ ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+ (srv_undo_space_id_start > 0 &&
+ space->id >= srv_undo_space_id_start &&
+ space->id <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES));
+ ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+ !srv_was_started ||
+ (srv_undo_space_id_start > 0 &&
+ space->id >= srv_undo_space_id_start
+ && space->id <= srv_undo_space_id_start +
+ srv_undo_tablespaces_open));
+ return space->id != SRV_TMP_SPACE_ID;
+ }
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
+ an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE 4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */
+#define TRX_RSEG_FORMAT 0
+/** Number of pages in the TRX_RSEG_HISTORY list */
+#define TRX_RSEG_HISTORY_SIZE 4
+/** Committed transaction logs that have not been purged yet */
+#define TRX_RSEG_HISTORY 8
+#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE)
+ /* Header for the file segment where
+ this page is placed */
+#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+ /* Undo log segment slots */
+/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0) */
+#define TRX_RSEG_MAX_TRX_ID (TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS \
+ * TRX_RSEG_SLOT_SIZE)
+
+/** 8 bytes offset within the binlog file */
+#define TRX_RSEG_BINLOG_OFFSET TRX_RSEG_MAX_TRX_ID + 8
+/** MySQL log file name, 512 bytes, including terminating NUL
+(valid only if TRX_RSEG_FORMAT is 0).
+If no binlog information is present, the first byte is NUL. */
+#define TRX_RSEG_BINLOG_NAME TRX_RSEG_MAX_TRX_ID + 16
+/** Maximum length of binlog file name, including terminating NUL, in bytes */
+#define TRX_RSEG_BINLOG_NAME_LEN 512
+
+#ifdef WITH_WSREP
+# include "trx0xa.h"
+
+/** Update the WSREP XID information in rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr);
+
+/** Update WSREP checkpoint XID in first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already
+stored into rollback segments, the WSREP XID in all the remaining rollback
+segments will be reset.
+@param[in] xid WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid);
+
+/** Recover the latest WSREP checkpoint XID.
+@param[out] xid WSREP XID
+@return whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid);
+#endif /* WITH_WSREP */
+
+/** Read the page number of an undo log slot.
+@param[in] rseg_header rollback segment header
+@param[in] n slot number */
+inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
+{
+ ut_ad(n < TRX_RSEG_N_SLOTS);
+ return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ n * TRX_RSEG_SLOT_SIZE + rseg_header->page.frame);
+}
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+@param[in,out] rseg_header rollback segment header page
+@param[in,out] mtr mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr);
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+@param[in,out] rseg_header rollback segment header
+@param[in] log_file_name binlog file name
+@param[in] log_offset binlog offset value
+@param[in,out] mtr mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header,
+ const char *log_file_name,
+ ulonglong log_offset,
+ mtr_t *mtr);
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
new file mode 100644
index 00000000..5dd0169f
--- /dev/null
+++ b/storage/innobase/include/trx0sys.h
@@ -0,0 +1,1274 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "trx0rseg.h"
+#include "mem0mem.h"
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+#include "trx0trx.h"
+#include "ilist.h"
+#include "my_cpu.h"
+
+#ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t trx_sys_mutex_key;
+#endif
+
+/** Checks if a page address is the trx sys header page.
+@param[in] page_id page id
+@return true if trx sys header page */
+inline bool trx_sys_hdr_page(const page_id_t page_id)
+{
+ return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr);
+
+/** Find an available rollback segment.
+@param[in] sys_header
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header);
+/** Request the TRX_SYS page.
+@param[in] rw whether to lock the page for writing
+@return the TRX_SYS page
+@retval NULL if the page cannot be read */
+inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
+{
+ return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+extern uint trx_rseg_n_slots_debug;
+#endif
+
+/** Write DB_TRX_ID.
+@param[out] db_trx_id the DB_TRX_ID field to be written to
+@param[in] id transaction ID */
+UNIV_INLINE
+void
+trx_write_trx_id(byte* db_trx_id, trx_id_t id)
+{
+ compile_time_assert(DATA_TRX_ID_LEN == 6);
+ mach_write_to_6(db_trx_id, id);
+}
+
+/** Read a transaction identifier.
+@return id */
+inline
+trx_id_t
+trx_read_trx_id(const byte* ptr)
+{
+ compile_time_assert(DATA_TRX_ID_LEN == 6);
+ return(mach_read_from_6(ptr));
+}
+
+#ifdef UNIV_DEBUG
+/** Check that the DB_TRX_ID in a record is valid.
+@param[in] db_trx_id the DB_TRX_ID column to validate
+@param[in] trx_id the id of the ALTER TABLE transaction */
+inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
+{
+ trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
+ ut_ad(id == 0 || id > trx_id);
+ return true;
+}
+#endif
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ const char* file_name,/*!< in: MySQL log file name */
+ int64_t offset, /*!< in: position in that log file */
+ buf_block_t* sys_header, /*!< in,out: trx sys header */
+ mtr_t* mtr); /*!< in,out: mini-transaction */
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset();
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool
+trx_sys_create_rsegs();
+
+/** The offset of the transaction system header on the page */
+#define TRX_SYS FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+/** In old versions of InnoDB, this persisted the value of
+trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
+the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
+and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
+are used instead. The field only exists for the purpose of upgrading
+from older MySQL or MariaDB versions. */
+#define TRX_SYS_TRX_ID_STORE 0
+#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the
+ tablespace segment the trx
+ system is created into */
+#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE)
+ /*!< the start of the array of
+ rollback segment specification
+ slots */
+
+/* Rollback segment specification slot offsets */
+
+/** the tablespace ID of an undo log header; starting with
+MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
+#define TRX_SYS_RSEG_SPACE 0
+/** the page number of an undo log header, or FIL_NULL if unused */
+#define TRX_SYS_RSEG_PAGE_NO 4
+/** Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE 8
+
+/** Read the tablespace ID of a rollback segment slot.
+@param[in] sys_header TRX_SYS page
+@param[in] rseg_id rollback segment identifier
+@return undo tablespace id */
+inline
+uint32_t
+trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
+{
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+ return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->page.frame);
+}
+
+/** Read the page number of a rollback segment slot.
+@param[in] sys_header TRX_SYS page
+@param[in] rseg_id rollback segment identifier
+@return undo page number */
+inline uint32_t
+trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
+{
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+ return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
+ sys_header->page.frame);
+}
+
+/** Maximum length of MySQL binlog file name, in bytes.
+(Used before MariaDB 10.3.5.) */
+#define TRX_SYS_MYSQL_LOG_NAME_LEN 512
+/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
+#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
+
+#if UNIV_PAGE_SIZE_MIN < 4096
+# error "UNIV_PAGE_SIZE_MIN < 4096"
+#endif
+/** The offset of the MySQL binlog offset info in the trx system header */
+#define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000)
+#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is
+ TRX_SYS_MYSQL_LOG_MAGIC_N
+ if we have valid data in the
+ MySQL binlog info */
+#define TRX_SYS_MYSQL_LOG_OFFSET 4 /*!< the 64-bit offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
+
+/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096
+
+0...37 FIL_HEADER
+38...45 TRX_SYS_TRX_ID_STORE
+46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10)
+56 TRX_SYS_RSEGS
+ 56...59 TRX_SYS_RSEG_SPACE for slot 0
+ 60...63 TRX_SYS_RSEG_PAGE_NO for slot 0
+ 64...67 TRX_SYS_RSEG_SPACE for slot 1
+ 68...71 TRX_SYS_RSEG_PAGE_NO for slot 1
+....
+ 594..597 TRX_SYS_RSEG_SPACE for slot 72
+ 598..601 TRX_SYS_RSEG_PAGE_NO for slot 72
+...
+ ...1063 TRX_SYS_RSEG_PAGE_NO for slot 126
+
+(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace
+space_id, page_no pairs :::)
+596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
+600 TRX_SYS_WSREP_XID_FORMAT
+604 TRX_SYS_WSREP_XID_GTRID_LEN
+608 TRX_SYS_WSREP_XID_BQUAL_LEN
+612 TRX_SYS_WSREP_XID_DATA (len = 128)
+739 TRX_SYS_WSREP_XID_DATA_END
+
+FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
+(srv_page_size-2500)
+1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
+1600 TRX_SYS_WSREP_XID_FORMAT
+1604 TRX_SYS_WSREP_XID_GTRID_LEN
+1608 TRX_SYS_WSREP_XID_BQUAL_LEN
+1612 TRX_SYS_WSREP_XID_DATA (len = 128)
+1739 TRX_SYS_WSREP_XID_DATA_END
+
+(srv_page_size - 2000 MYSQL MASTER LOG)
+2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
+2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
+2108 TRX_SYS_MYSQL_LOG_NAME
+
+(srv_page_size - 1000 MYSQL LOG)
+3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
+3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
+3108 TRX_SYS_MYSQL_LOG_NAME
+
+(srv_page_size - 200 DOUBLEWRITE)
+3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG
+3906 TRX_SYS_DOUBLEWRITE_MAGIC
+3910 TRX_SYS_DOUBLEWRITE_BLOCK1
+3914 TRX_SYS_DOUBLEWRITE_BLOCK2
+3918 TRX_SYS_DOUBLEWRITE_REPEAT
+3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N
+
+(srv_page_size - 8, TAILER)
+4088..4096 FIL_TAILER
+
+*/
+#ifdef WITH_WSREP
+/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */
+#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
+#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
+#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265
+
+/** XID field: formatID, gtrid_len, bqual_len, xid_data */
+#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE)
+#define TRX_SYS_WSREP_XID_FORMAT 4
+#define TRX_SYS_WSREP_XID_GTRID_LEN 8
+#define TRX_SYS_WSREP_XID_BQUAL_LEN 12
+#define TRX_SYS_WSREP_XID_DATA 16
+#endif /* WITH_WSREP*/
+
+/** Doublewrite buffer */
+/* @{ */
+/** The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE (srv_page_size - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg
+ containing the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
+ /*!< 4-byte magic number which
+ shows if we already have
+ created the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the first
+ sequence of 64
+ (= FSP_EXTENT_SIZE) consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the second
+ sequence of 64 consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat
+ TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+ TRX_SYS_DOUBLEWRITE_BLOCK2
+ so that if the trx sys
+ header is half-written
+ to disk, we still may
+ be able to recover the
+ information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
+/* @} */
+
+trx_t* current_trx();
+
+struct rw_trx_hash_element_t
+{
+ rw_trx_hash_element_t()
+ {
+ memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+ mutex.init();
+ }
+
+
+ ~rw_trx_hash_element_t() { mutex.destroy(); }
+
+
+ trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
+
+ /**
+ Transaction serialization number.
+
+ Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY
+ state. Initially set to TRX_ID_MAX.
+ */
+ Atomic_counter<trx_id_t> no;
+ trx_t *trx;
+ srw_mutex mutex;
+};
+
+
+/**
+ Wrapper around LF_HASH to store set of in memory read-write transactions.
+*/
+
+class rw_trx_hash_t
+{
+ LF_HASH hash;
+
+
+ template <typename T>
+ using walk_action= my_bool(rw_trx_hash_element_t *element, T *action);
+
+
+ /**
+ Constructor callback for lock-free allocator.
+
+ Object is just allocated and is not yet accessible via rw_trx_hash by
+ concurrent threads. Object can be reused multiple times before it is freed.
+ Every time object is being reused initializer() callback is called.
+ */
+
+ static void rw_trx_hash_constructor(uchar *arg)
+ {
+ new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
+ }
+
+
+ /**
+ Destructor callback for lock-free allocator.
+
+ Object is about to be freed and is not accessible via rw_trx_hash by
+ concurrent threads.
+ */
+
+ static void rw_trx_hash_destructor(uchar *arg)
+ {
+ reinterpret_cast<rw_trx_hash_element_t*>
+ (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
+ }
+
+
+ /**
+ Destructor callback for lock-free allocator.
+
+ This destructor is used at shutdown. It frees remaining transaction
+ objects.
+
+ XA PREPARED transactions may remain if they haven't been committed or
+ rolled back. ACTIVE transactions may remain if startup was interrupted or
+ server is running in read-only mode or for certain srv_force_recovery
+ levels.
+ */
+
+ static void rw_trx_hash_shutdown_destructor(uchar *arg)
+ {
+ rw_trx_hash_element_t *element=
+ reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
+ if (trx_t *trx= element->trx)
+ {
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+ (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
+ (!srv_was_started ||
+ srv_read_only_mode ||
+ srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
+ trx_free_at_shutdown(trx);
+ }
+ element->~rw_trx_hash_element_t();
+ }
+
+
+ /**
+ Initializer callback for lock-free hash.
+
+ Object is not yet accessible via rw_trx_hash by concurrent threads, but is
+ about to become such. Object id can be changed only by this callback and
+ remains the same until all pins to this object are released.
+
+ Object trx can be changed to 0 by erase() under object mutex protection,
+ which indicates it is about to be removed from lock-free hash and become
+ not accessible by concurrent threads.
+ */
+
+ static void rw_trx_hash_initializer(LF_HASH *,
+ rw_trx_hash_element_t *element,
+ trx_t *trx)
+ {
+ ut_ad(element->trx == 0);
+ element->trx= trx;
+ element->id= trx->id;
+ element->no= TRX_ID_MAX;
+ trx->rw_trx_hash_element= element;
+ }
+
+
+ /**
+ Gets LF_HASH pins.
+
+ Pins are used to protect object from being destroyed or reused. They are
+ normally stored in trx object for quick access. If caller doesn't have trx
+ available, we try to get it using currnet_trx(). If caller doesn't have trx
+ at all, temporary pins are allocated.
+ */
+
+ LF_PINS *get_pins(trx_t *trx)
+ {
+ if (!trx->rw_trx_hash_pins)
+ {
+ trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
+ ut_a(trx->rw_trx_hash_pins);
+ }
+ return trx->rw_trx_hash_pins;
+ }
+
+
+ template <typename T> struct eliminate_duplicates_arg
+ {
+ trx_ids_t ids;
+ walk_action<T> *action;
+ T *argument;
+ eliminate_duplicates_arg(size_t size, walk_action<T> *act, T *arg):
+ action(act), argument(arg) { ids.reserve(size); }
+ };
+
+
+ template <typename T>
+ static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
+ eliminate_duplicates_arg<T> *arg)
+ {
+ for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
+ {
+ if (*it == element->id)
+ return 0;
+ }
+ arg->ids.push_back(element->id);
+ return arg->action(element, arg->argument);
+ }
+
+
+#ifdef UNIV_DEBUG
+ static void validate_element(trx_t *trx)
+ {
+ ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
+ ut_ad(!trx->is_autocommit_non_locking());
+ /* trx->state can be anything except TRX_STATE_NOT_STARTED */
+ ut_d(trx->mutex_lock());
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_d(trx->mutex_unlock());
+ }
+
+
+ template <typename T> struct debug_iterator_arg
+ {
+ walk_action<T> *action;
+ T *argument;
+ };
+
+
+ template <typename T>
+ static my_bool debug_iterator(rw_trx_hash_element_t *element,
+ debug_iterator_arg<T> *arg)
+ {
+ element->mutex.wr_lock();
+ if (element->trx)
+ validate_element(element->trx);
+ element->mutex.wr_unlock();
+ ut_ad(element->id < element->no);
+ return arg->action(element, arg->argument);
+ }
+#endif
+
+
+public:
+ void init()
+ {
+ lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
+ sizeof(trx_id_t), 0, &my_charset_bin);
+ hash.alloc.constructor= rw_trx_hash_constructor;
+ hash.alloc.destructor= rw_trx_hash_destructor;
+ hash.initializer=
+ reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
+ }
+
+
+ void destroy()
+ {
+ hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
+ lf_hash_destroy(&hash);
+ }
+
+
+ /**
+ Releases LF_HASH pins.
+
+ Must be called by thread that owns trx_t object when the latter is being
+ "detached" from thread (e.g. released to the pool by trx_t::free()). Can be
+ called earlier if thread is expected not to use rw_trx_hash.
+
+ Since pins are not allowed to be transferred to another thread,
+ initialisation thread calls this for recovered transactions.
+ */
+
+ void put_pins(trx_t *trx)
+ {
+ if (trx->rw_trx_hash_pins)
+ {
+ lf_hash_put_pins(trx->rw_trx_hash_pins);
+ trx->rw_trx_hash_pins= 0;
+ }
+ }
+
+
+ /**
+ Finds trx object in lock-free hash with given id.
+
+ Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
+ the transaction may get committed before this method returns.
+
+ With do_ref_count == false the caller may dereference returned trx pointer
+ only if lock_sys.latch was acquired before calling find().
+
+ With do_ref_count == true caller may dereference trx even if it is not
+ holding lock_sys.latch. Caller is responsible for calling
+ trx->release_reference() when it is done playing with trx.
+
+ Ideally this method should get caller rw_trx_hash_pins along with trx
+ object as a parameter, similar to insert() and erase(). However most
+ callers lose trx early in their call chains and it is not that easy to pass
+ them through.
+
+ So we take more expensive approach: get trx through current_thd()->ha_data.
+ Some threads don't have trx attached to THD, and at least server
+ initialisation thread, fts_optimize_thread, srv_master_thread,
+ dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
+ have THD at all. For such cases we allocate pins only for duration of
+ search and free them immediately.
+
+ This has negative performance impact and should be fixed eventually (by
+ passing caller_trx as a parameter). Still stream of DML is more or less Ok.
+
+ @return
+ @retval 0 not found
+ @retval pointer to trx
+ */
+
+ trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count)
+ {
+ /*
+ In MariaDB 10.3, purge will reset DB_TRX_ID to 0
+ when the history is lost. Read/write transactions will
+ always have a nonzero trx_t::id; there the value 0 is
+ reserved for transactions that did not write or lock
+ anything yet.
+
+ The caller should already have handled trx_id==0 specially.
+ */
+ ut_ad(trx_id);
+ ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count);
+
+ trx_t *trx= 0;
+ LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
+ ut_a(pins);
+
+ rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
+ (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
+ sizeof(trx_id_t)));
+ if (element)
+ {
+ /* rw_trx_hash_t::erase() sets element->trx to nullptr under
+ element->mutex protection before removing the element from hash table.
+ If the element was removed before the mutex acquisition, element->trx
+ will be equal to nullptr. */
+ DEBUG_SYNC_C("before_trx_hash_find_element_mutex_enter");
+ element->mutex.wr_lock();
+ /* element_trx can't point to reused object now. If transaction was
+ deregistered before element->mutex acquisition, element->trx is nullptr.
+ It can't be deregistered while element->mutex is held. */
+ trx_t *element_trx = element->trx;
+ lf_hash_search_unpin(pins);
+ /* The *element can be reused now, as element->trx value is stored
+ locally in element_trx. */
+ DEBUG_SYNC_C("after_trx_hash_find_element_mutex_enter");
+ if ((trx= element_trx)) {
+ DBUG_ASSERT(trx_id == trx->id);
+ ut_d(validate_element(trx));
+ if (do_ref_count)
+ {
+ /*
+ We have an early state check here to avoid committer
+ starvation in a wait loop for transaction references,
+ when there's a stream of trx_sys.find() calls from other
+ threads. The trx->state may change to COMMITTED after
+ trx->mutex is released, and it will have to be rechecked
+ by the caller after reacquiring the mutex.
+ */
+ /* trx_t::commit_in_memory() sets the state to
+ TRX_STATE_COMMITTED_IN_MEMORY before deregistering the transaction.
+ It also waits for any implicit-to-explicit lock conversions to cease
+ after deregistering. */
+ if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+ trx= nullptr;
+ else
+ trx->reference();
+ }
+ }
+ /* element's lifetime is equal to the hash lifetime, that's why
+ element->mutex is valid here despite the element is unpinned. In the
+ worst case some thread will wait for element->mutex releasing. */
+ element->mutex.wr_unlock();
+ }
+ if (!caller_trx)
+ lf_hash_put_pins(pins);
+ return trx;
+ }
+
+
+ /**
+ Inserts trx to lock-free hash.
+
+ Object becomes accessible via rw_trx_hash.
+ */
+
+ void insert(trx_t *trx)
+ {
+ ut_d(validate_element(trx));
+ int res= lf_hash_insert(&hash, get_pins(trx),
+ reinterpret_cast<void*>(trx));
+ ut_a(res == 0);
+ }
+
+
+ /**
+ Removes trx from lock-free hash.
+
+ Object becomes not accessible via rw_trx_hash. But it still can be pinned
+ by concurrent find(), which is supposed to release it immediately after
+ it sees object trx is 0.
+ */
+
+ void erase(trx_t *trx)
+ {
+ ut_d(validate_element(trx));
+ trx->rw_trx_hash_element->mutex.wr_lock();
+ trx->rw_trx_hash_element->trx= nullptr;
+ trx->rw_trx_hash_element->mutex.wr_unlock();
+ int res= lf_hash_delete(&hash, get_pins(trx),
+ reinterpret_cast<const void*>(&trx->id),
+ sizeof(trx_id_t));
+ ut_a(res == 0);
+ }
+
+
+ /**
+ Returns the number of elements in the hash.
+
+ The number is exact only if hash is protected against concurrent
+ modifications (e.g. single threaded startup or hash is protected
+ by some mutex). Otherwise the number may be used as a hint only,
+ because it may change even before this method returns.
+ */
+
+ uint32_t size() { return uint32_t(lf_hash_size(&hash)); }
+
+
+ /**
+ Iterates the hash.
+
+ @param caller_trx used to get/set pins
+ @param action called for every element in hash
+ @param argument opque argument passed to action
+
+ May return the same element multiple times if hash is under contention.
+ If caller doesn't like to see the same transaction multiple times, it has
+ to call iterate_no_dups() instead.
+
+ May return element with committed transaction. If caller doesn't like to
+ see committed transactions, it has to skip those under element mutex:
+
+ element->mutex.wr_lock();
+ if (trx_t trx= element->trx)
+ {
+ // trx is protected against commit in this branch
+ }
+ element->mutex.wr_unlock();
+
+ May miss concurrently inserted transactions.
+
+ @return
+ @retval 0 iteration completed successfully
+ @retval 1 iteration was interrupted (action returned 1)
+ */
+
+ template <typename T>
+ int iterate(trx_t *caller_trx, walk_action<T> *action, T *argument= nullptr)
+ {
+ LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
+ ut_a(pins);
+#ifdef UNIV_DEBUG
+ debug_iterator_arg<T> debug_arg= { action, argument };
+ action= reinterpret_cast<decltype(action)>(debug_iterator<T>);
+ argument= reinterpret_cast<T*>(&debug_arg);
+#endif
+ int res= lf_hash_iterate(&hash, pins,
+ reinterpret_cast<my_hash_walk_action>(action),
+ const_cast<void*>(static_cast<const void*>
+ (argument)));
+ if (!caller_trx)
+ lf_hash_put_pins(pins);
+ return res;
+ }
+
+
+ template <typename T>
+ int iterate(walk_action<T> *action, T *argument= nullptr)
+ {
+ return iterate(current_trx(), action, argument);
+ }
+
+
+ /**
+ Iterates the hash and eliminates duplicate elements.
+
+ @sa iterate()
+ */
+
+ template <typename T>
+ int iterate_no_dups(trx_t *caller_trx, walk_action<T> *action,
+ T *argument= nullptr)
+ {
+ eliminate_duplicates_arg<T> arg(size() + 32, action, argument);
+ return iterate(caller_trx, eliminate_duplicates<T>, &arg);
+ }
+
+
+ template <typename T>
+ int iterate_no_dups(walk_action<T> *action, T *argument= nullptr)
+ {
+ return iterate_no_dups(current_trx(), action, argument);
+ }
+};
+
+class thread_safe_trx_ilist_t
+{
+public:
+ void create() { mysql_mutex_init(trx_sys_mutex_key, &mutex, nullptr); }
+ void close() { mysql_mutex_destroy(&mutex); }
+
+ bool empty() const
+ {
+ mysql_mutex_lock(&mutex);
+ auto result= trx_list.empty();
+ mysql_mutex_unlock(&mutex);
+ return result;
+ }
+
+ void push_front(trx_t &trx)
+ {
+ mysql_mutex_lock(&mutex);
+ trx_list.push_front(trx);
+ mysql_mutex_unlock(&mutex);
+ }
+
+ void remove(trx_t &trx)
+ {
+ mysql_mutex_lock(&mutex);
+ trx_list.remove(trx);
+ mysql_mutex_unlock(&mutex);
+ }
+
+ template <typename Callable> void for_each(Callable &&callback) const
+ {
+ mysql_mutex_lock(&mutex);
+ for (const auto &trx : trx_list)
+ callback(trx);
+ mysql_mutex_unlock(&mutex);
+ }
+
+ template <typename Callable> void for_each(Callable &&callback)
+ {
+ mysql_mutex_lock(&mutex);
+ for (auto &trx : trx_list)
+ callback(trx);
+ mysql_mutex_unlock(&mutex);
+ }
+
+ void freeze() const { mysql_mutex_lock(&mutex); }
+ void unfreeze() const { mysql_mutex_unlock(&mutex); }
+
+private:
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable mysql_mutex_t mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist<trx_t> trx_list;
+};
+
+/** The transaction system central memory data structure. */
+class trx_sys_t
+{
+ /**
+ The smallest number not yet assigned as a transaction id or transaction
+ number. Accessed and updated with atomic operations.
+ */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<trx_id_t> m_max_trx_id;
+
+
+ /**
+ Solves race conditions between register_rw() and snapshot_ids() as well as
+ race condition between assign_new_trx_no() and snapshot_ids().
+
+ @sa register_rw()
+ @sa assign_new_trx_no()
+ @sa snapshot_ids()
+ */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ std::atomic<trx_id_t> m_rw_trx_hash_version;
+
+
+ bool m_initialised;
+
+ /** False if there is no undo log to purge or rollback */
+ bool undo_log_nonempty;
+public:
+ /** List of all transactions. */
+ thread_safe_trx_ilist_t trx_list;
+
+ /** Temporary rollback segments */
+ trx_rseg_t temp_rsegs[TRX_SYS_N_RSEGS];
+
+ /** Persistent rollback segments; space==nullptr if slot not in use */
+ trx_rseg_t rseg_array[TRX_SYS_N_RSEGS];
+
+ /**
+ Lock-free hash of in memory read-write transactions.
+ Works faster when it is on it's own cache line (tested).
+ */
+
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) rw_trx_hash_t rw_trx_hash;
+
+
+#ifdef WITH_WSREP
+ /** Latest recovered XID during startup */
+ XID recovered_wsrep_xid;
+#endif
+ /** Latest recovered binlog offset */
+ uint64_t recovered_binlog_offset;
+ /** Latest recovered binlog file name */
+ char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
+ /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
+ lsn_t recovered_binlog_lsn;
+
+
+ /**
+ Constructor.
+
+ Some members may require late initialisation, thus we just mark object as
+ uninitialised. Real initialisation happens in create().
+ */
+
+ trx_sys_t(): m_initialised(false) {}
+
+
+ /**
+ @return TRX_RSEG_HISTORY length (number of committed transactions to purge)
+ */
+ size_t history_size();
+
+
+ /**
+ Check whether history_size() exceeds a specified number.
+ @param threshold number of committed transactions
+ @return whether TRX_RSEG_HISTORY length exceeds the threshold
+ */
+ bool history_exceeds(size_t threshold);
+
+
+ /**
+ @return approximate history_size(), without latch protection
+ */
+ TPOOL_SUPPRESS_TSAN size_t history_size_approx() const;
+
+
+ /**
+ @return whether history_size() is nonzero (with some race condition)
+ */
+ TPOOL_SUPPRESS_TSAN bool history_exists();
+
+
+ /**
+ Determine if the specified transaction or any older one might be active.
+
+ @param trx current transaction
+ @param id transaction identifier
+ @return whether any transaction not newer than id might be active
+ */
+
+ bool find_same_or_older(trx_t *trx, trx_id_t id)
+ {
+ if (trx->max_inactive_id >= id)
+ return false;
+ bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id);
+ if (!found)
+ trx->max_inactive_id= id;
+ return found;
+ }
+
+
+ /**
+ Determines the maximum transaction id.
+
+ @return maximum currently allocated trx id; will be stale after the
+ next call to trx_sys.get_new_trx_id()
+ */
+
+ trx_id_t get_max_trx_id()
+ {
+ return m_max_trx_id;
+ }
+
+
+ /**
+ Allocates a new transaction id.
+ @return new, allocated trx id
+ */
+
+ trx_id_t get_new_trx_id()
+ {
+ trx_id_t id= get_new_trx_id_no_refresh();
+ refresh_rw_trx_hash_version();
+ return id;
+ }
+
+
+ /**
+ Allocates and assigns new transaction serialisation number.
+
+ There's a gap between m_max_trx_id increment and transaction serialisation
+ number becoming visible through rw_trx_hash. While we're in this gap
+ concurrent thread may come and do MVCC snapshot without seeing allocated
+ but not yet assigned serialisation number. Then at some point purge thread
+ may clone this view. As a result it won't see newly allocated serialisation
+ number and may remove "unnecessary" history data of this transaction from
+ rollback segments.
+
+ m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+ to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+ means that all transaction serialisation numbers up to m_max_trx_id are
+ available through rw_trx_hash.
+
+ We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+ that m_rw_trx_hash_version increment happens after
+ trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
+
+ @param trx transaction
+ */
+ void assign_new_trx_no(trx_t *trx)
+ {
+ trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh();
+ refresh_rw_trx_hash_version();
+ }
+
+
+ /**
+ Takes MVCC snapshot.
+
+ To reduce malloc probablility we reserve rw_trx_hash.size() + 32 elements
+ in ids.
+
+ For details about get_rw_trx_hash_version() != get_max_trx_id() spin
+ @sa register_rw() and @sa assign_new_trx_no().
+
+ We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
+ that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
+
+ To optimise snapshot creation rw_trx_hash.iterate() is being used instead
+ of rw_trx_hash.iterate_no_dups(). It means that some transaction
+ identifiers may appear multiple times in ids.
+
+ @param[in,out] caller_trx used to get access to rw_trx_hash_pins
+ @param[out] ids array to store registered transaction identifiers
+ @param[out] max_trx_id variable to store m_max_trx_id value
+ @param[out] mix_trx_no variable to store min(no) value
+ */
+
+ void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
+ trx_id_t *min_trx_no)
+ {
+ snapshot_ids_arg arg(ids);
+
+ while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
+ ut_delay(1);
+ arg.m_no= arg.m_id;
+
+ ids->clear();
+ ids->reserve(rw_trx_hash.size() + 32);
+ rw_trx_hash.iterate(caller_trx, copy_one_id, &arg);
+
+ *max_trx_id= arg.m_id;
+ *min_trx_no= arg.m_no;
+ }
+
+
+ /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
+ void init_max_trx_id(trx_id_t value)
+ {
+ m_max_trx_id= value;
+ m_rw_trx_hash_version.store(value, std::memory_order_relaxed);
+ }
+
+
+ bool is_initialised() const { return m_initialised; }
+
+
+ /** Initialise the transaction subsystem. */
+ void create();
+
+ /** Close the transaction subsystem on shutdown. */
+ void close();
+
+ /** @return total number of active (non-prepared) transactions */
+ size_t any_active_transactions(size_t *prepared= nullptr);
+
+
+ /**
+ Determine the rollback segment identifier.
+
+ @param rseg rollback segment
+ @param persistent whether the rollback segment is persistent
+ @return the rollback segment identifier
+ */
+ unsigned rseg_id(const trx_rseg_t *rseg, bool persistent) const
+ {
+ const trx_rseg_t *array= persistent ? rseg_array : temp_rsegs;
+ ut_ad(rseg >= array);
+ ut_ad(rseg < &array[TRX_SYS_N_RSEGS]);
+ return static_cast<unsigned>(rseg - array);
+ }
+
+
+ /**
+ Registers read-write transaction.
+
+ Transaction becomes visible to MVCC.
+
+ There's a gap between m_max_trx_id increment and transaction becoming
+ visible through rw_trx_hash. While we're in this gap concurrent thread may
+ come and do MVCC snapshot. As a result concurrent read view will be able to
+ observe records owned by this transaction even before it was committed.
+
+ m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+ to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+ means that all transactions up to m_max_trx_id are available through
+ rw_trx_hash.
+
+ We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+ that m_rw_trx_hash_version increment happens after transaction becomes
+ visible through rw_trx_hash.
+ */
+
+ void register_rw(trx_t *trx)
+ {
+ trx->id= get_new_trx_id_no_refresh();
+ rw_trx_hash.insert(trx);
+ refresh_rw_trx_hash_version();
+ }
+
+
+ /**
+ Deregisters read-write transaction.
+
+ Transaction is removed from rw_trx_hash, which releases all implicit locks.
+ MVCC snapshot won't see this transaction anymore.
+ */
+
+ void deregister_rw(trx_t *trx)
+ {
+ rw_trx_hash.erase(trx);
+ }
+
+
+ bool is_registered(trx_t *caller_trx, trx_id_t id)
+ {
+ return id && find(caller_trx, id, false);
+ }
+
+
+ trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true)
+ {
+ return rw_trx_hash.find(caller_trx, id, do_ref_count);
+ }
+
+
+ /**
+ Registers transaction in trx_sys.
+
+ @param trx transaction
+ */
+ void register_trx(trx_t *trx)
+ {
+ trx_list.push_front(*trx);
+ }
+
+
+ /**
+ Deregisters transaction in trx_sys.
+
+ @param trx transaction
+ */
+ void deregister_trx(trx_t *trx)
+ {
+ trx_list.remove(*trx);
+ }
+
+
+ /**
+ Clones the oldest view and stores it in view.
+
+ No need to call ReadView::close(). The caller owns the view that is passed
+ in. This function is called by purge thread to determine whether it should
+ purge the delete marked record or not.
+ */
+ void clone_oldest_view(ReadViewBase *view) const;
+
+
+ /** @return the number of active views */
+ size_t view_count() const
+ {
+ size_t count= 0;
+
+ trx_list.for_each([&count](const trx_t &trx) {
+ if (trx.read_view.is_open())
+ ++count;
+ });
+
+ return count;
+ }
+
+ /** Set the undo log empty value */
+ void set_undo_non_empty(bool val)
+ {
+ if (!undo_log_nonempty)
+ undo_log_nonempty= val;
+ }
+
+ /** Get the undo log empty value */
+ bool is_undo_empty() const { return !undo_log_nonempty; }
+
+ /* Reset the trx_sys page and retain the dblwr information,
+ system rollback segment header page
+ @return error code */
+ inline dberr_t reset_page(mtr_t *mtr);
+private:
+ static my_bool find_same_or_older_callback(rw_trx_hash_element_t *element,
+ trx_id_t *id)
+ {
+ return element->id <= *id;
+ }
+
+
+ struct snapshot_ids_arg
+ {
+ snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
+ trx_ids_t *m_ids;
+ trx_id_t m_id;
+ trx_id_t m_no;
+ };
+
+
+ static my_bool copy_one_id(rw_trx_hash_element_t *element,
+ snapshot_ids_arg *arg)
+ {
+ if (element->id < arg->m_id)
+ {
+ trx_id_t no= element->no;
+ arg->m_ids->push_back(element->id);
+ if (no < arg->m_no)
+ arg->m_no= no;
+ }
+ return 0;
+ }
+
+
+ /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
+ trx_id_t get_rw_trx_hash_version()
+ {
+ return m_rw_trx_hash_version.load(std::memory_order_acquire);
+ }
+
+
+ /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
+ void refresh_rw_trx_hash_version()
+ {
+ m_rw_trx_hash_version.fetch_add(1, std::memory_order_release);
+ }
+
+
+ /**
+ Allocates new transaction id without refreshing rw_trx_hash version.
+
+ This method is extracted for exclusive use by register_rw() and
+ assign_new_trx_no() where new id must be allocated atomically with
+ payload of these methods from MVCC snapshot point of view.
+
+ @sa get_new_trx_id()
+ @sa assign_new_trx_no()
+
+ @return new transaction id
+ */
+
+ trx_id_t get_new_trx_id_no_refresh()
+ {
+ return m_max_trx_id++;
+ }
+};
+
+
+/** The transaction system */
+extern trx_sys_t trx_sys;
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
new file mode 100644
index 00000000..3cfbe331
--- /dev/null
+++ b/storage/innobase/include/trx0trx.h
@@ -0,0 +1,1268 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "trx0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+#include "fts0fts.h"
+#include "read0types.h"
+#include "ilist.h"
+#include "row0merge.h"
+
+#include <vector>
+
+// Forward declaration
+struct mtr_t;
+struct rw_trx_hash_element_t;
+
+/******************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg); /*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewinded before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file); /*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx); /*!< in: trx object */
+
+/** @return an allocated transaction */
+trx_t *trx_create();
+
+/** At shutdown, frees a transaction object. */
+void trx_free_at_shutdown(trx_t *trx);
+
+/** Disconnect a prepared transaction from MySQL.
+@param[in,out] trx transaction */
+void trx_disconnect_prepared(trx_t *trx);
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start();
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write); /*!< in: true if read write transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write); /*!< in: true if read write transaction */
+
+/**
+Start a transaction for internal processing.
+@param trx transaction
+@param read_write whether writes may be performed */
+void trx_start_internal_low(trx_t *trx, bool read_write);
+
+#ifdef UNIV_DEBUG
+#define trx_start_if_not_started_xa(t, rw) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_if_not_started_xa_low((t), rw); \
+ } while (false)
+
+#define trx_start_if_not_started(t, rw) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_if_not_started_low((t), rw); \
+ } while (false)
+
+#define trx_start_internal(t) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_internal_low(t, true); \
+ } while (false)
+#define trx_start_internal_read_only(t) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_internal_low(t, false); \
+ } while (false)
+#else
+#define trx_start_if_not_started(t, rw) \
+ trx_start_if_not_started_low((t), rw)
+
+#define trx_start_internal(t) trx_start_internal_low(t, true)
+#define trx_start_internal_read_only(t) trx_start_internal_low(t, false)
+
+#define trx_start_if_not_started_xa(t, rw) \
+ trx_start_if_not_started_xa_low((t), (rw))
+#endif /* UNIV_DEBUG */
+
+/** Start a transaction for a DDL operation.
+@param trx transaction */
+void trx_start_for_ddl_low(trx_t *trx);
+
+#ifdef UNIV_DEBUG
+# define trx_start_for_ddl(t) \
+ do { \
+ ut_ad((t)->start_file == 0); \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_for_ddl_low(t); \
+ } while (0)
+#else
+# define trx_start_for_ddl(t) trx_start_for_ddl_low(t)
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx); /*!< in/out: transaction */
+/** XA PREPARE a transaction.
+@param[in,out] trx transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx);
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery.
+@return number of prepared transactions */
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ uint len); /*!< in: number of slots in xid_list */
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in] xid X/Open XA transaction identifier
+@return transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval NULL if no match */
+trx_t* trx_get_trx_by_xid(const XID* xid);
+/** Durably write log until trx->commit_lsn
+(if trx_t::commit_in_memory() was invoked with flush_log_later=true). */
+void trx_commit_complete_for_mysql(trx_t *trx);
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx); /*!< in: trx handle */
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**********************************************************************//**
+Prints info about a transaction. */
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: trx->lock.n_rec_locks */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size);
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+
+/**********************************************************************//**
+Prints info about a transaction.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len); /*!< in: max query length to print,
+ or 0 to use the default max length */
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.latch. */
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len); /*!< in: max query length to print,
+ or 0 to use the default max length */
+
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx->mutex, or it must be the thread
+that is serving a running transaction.
+A running RW transaction must be in trx_sys.rw_trx_hash.
+@return TRUE if trx->state == state */
+UNIV_INLINE
+bool
+trx_state_eq(
+/*=========*/
+ const trx_t* trx, /*!< in: transaction */
+ trx_state_t state, /*!< in: state;
+ if state != TRX_STATE_NOT_STARTED
+ asserts that
+ trx->state != TRX_STATE_NOT_STARTED */
+ bool relaxed = false)
+ /*!< in: whether to allow
+ trx->state == TRX_STATE_NOT_STARTED
+ after an error has been reported */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return true if interrupted */
+bool
+trx_is_interrupted(
+/*===============*/
+ const trx_t* trx); /*!< in: transaction */
+
+/*******************************************************************//**
+Calculates the "weight" of a transaction. The weight of one transaction
+is estimated as the number of altered rows + the number of locked rows.
+@param t transaction
+@return transaction weight */
+#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
+
+/** Create the trx_t pool */
+void
+trx_pool_init();
+
+/** Destroy the trx_t pool */
+void
+trx_pool_close();
+
+/**
+Set the transaction as a read-write transaction if it is not already
+tagged as such.
+@param[in,out] trx Transaction that needs to be "upgraded" to RW from RO */
+void
+trx_set_rw_mode(
+ trx_t* trx);
+
+/**
+Transactions that aren't started by the MySQL server don't set
+the trx_t::mysql_thd field. For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx transaction
+@return lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(t) \
+ ((t)->mysql_thd != NULL \
+ ? thd_lock_wait_timeout((t)->mysql_thd) \
+ : 0)
+
+typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> > lock_list;
+
+/** The locks and state of an active transaction. Protected by
+lock_sys.latch, trx->mutex or both. */
+struct trx_lock_t
+{
+ /** Lock request being waited for.
+ Set to nonnull when holding lock_sys.latch, lock_sys.wait_mutex and
+ trx->mutex, by the thread that is executing the transaction.
+ Set to nullptr when holding lock_sys.wait_mutex. */
+ Atomic_relaxed<lock_t*> wait_lock;
+ /** Transaction being waited for; protected by lock_sys.wait_mutex */
+ trx_t *wait_trx;
+ /** condition variable for !wait_lock; used with lock_sys.wait_mutex */
+ pthread_cond_t cond;
+ /** lock wait start time */
+ Atomic_relaxed<my_hrtime_t> suspend_time;
+
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+ /** 2=high priority WSREP thread has marked this trx to abort;
+ 1=another transaction chose this as a victim in deadlock resolution.
+
+ Other threads than the one that is executing the transaction may set
+ flags in this while holding lock_sys.wait_mutex. */
+ Atomic_relaxed<byte> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim()
+ {
+# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ /* There is no 8-bit version of the 80386 BTS instruction.
+ Technically, this is the wrong addressing mode (16-bit), but
+ there are other data members stored after the byte. */
+ __asm__ __volatile__("lock btsw $1, %0"
+ : "+m" (was_chosen_as_deadlock_victim));
+# else
+ was_chosen_as_deadlock_victim.fetch_or(2);
+# endif
+ }
+#else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** High priority WSREP thread has marked this trx to abort or
+ another transaction chose this as a victim in deadlock resolution.
+
+ Other threads than the one that is executing the transaction may set
+ this while holding lock_sys.wait_mutex. */
+ Atomic_relaxed<bool> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim() { was_chosen_as_deadlock_victim= true; }
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** Next available rec_pool[] entry */
+ byte rec_cached;
+ /** Next available table_pool[] entry */
+ byte table_cached;
+
+ que_thr_t* wait_thr; /*!< query thread belonging to this
+ trx that is in waiting
+ state. For threads suspended in a
+ lock wait, this is protected by
+ lock_sys.latch. Otherwise, this may
+ only be modified by the thread that is
+ serving the running transaction. */
+
+ /** Pre-allocated record locks */
+ struct {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ib_lock_t lock;
+ } rec_pool[8];
+
+ /** Pre-allocated table locks */
+ ib_lock_t table_pool[8];
+
+ /** Memory heap for trx_locks. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner(). */
+ mem_heap_t *lock_heap;
+
+ /** Locks held by the transaction. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner().
+ (If lock_sys.latch is only held in shared mode, then the modification
+ must be protected by trx->mutex.) */
+ trx_lock_list_t trx_locks;
+
+ lock_list table_locks; /*!< All table locks requested by this
+ transaction, including AUTOINC locks */
+
+ /** List of pending trx_t::evict_table() */
+ UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables;
+
+ /** number of record locks; protected by lock_sys.assert_locked(page_id) */
+ ulint n_rec_locks;
+};
+
+/** Logical first modification time of a table in a transaction */
+class trx_mod_table_time_t
+{
+ /** Impossible value for trx_t::undo_no */
+ static constexpr undo_no_t NONE= ~undo_no_t{0};
+ /** Theoretical maximum value for trx_t::undo_no.
+ DB_ROLL_PTR is only 7 bytes, so it cannot point to more than
+ this many undo log records. */
+ static constexpr undo_no_t LIMIT= (undo_no_t{1} << (7 * 8)) - 1;
+
+ /** Flag in 'first' to indicate that subsequent operations are
+ covered by a TRX_UNDO_EMPTY record (for the first statement to
+ insert into an empty table) */
+ static constexpr undo_no_t BULK= 1ULL << 63;
+
+ /** First modification of the table, possibly ORed with BULK */
+ undo_no_t first;
+ /** First modification of a system versioned column
+ (NONE= no versioning, BULK= the table was dropped) */
+ undo_no_t first_versioned= NONE;
+#ifdef UNIV_DEBUG
+ /** Whether the modified table is a FTS auxiliary table */
+ bool fts_aux_table= false;
+#endif /* UNIV_DEBUG */
+
+ /** Buffer to store insert opertion */
+ row_merge_bulk_t *bulk_store= nullptr;
+
+ friend struct trx_t;
+public:
+ /** Constructor
+ @param rows number of modified rows so far */
+ trx_mod_table_time_t(undo_no_t rows) : first(rows) { ut_ad(rows < LIMIT); }
+
+#ifdef UNIV_DEBUG
+ /** Validation
+ @param rows number of modified rows so far
+ @return whether the object is valid */
+ bool valid(undo_no_t rows= NONE) const
+ { auto f= first & LIMIT; return f <= first_versioned && f <= rows; }
+#endif /* UNIV_DEBUG */
+ /** @return if versioned columns were modified */
+ bool is_versioned() const { return (~first_versioned & LIMIT) != 0; }
+ /** @return if the table was dropped */
+ bool is_dropped() const { return first_versioned == BULK; }
+
+ /** After writing an undo log record, set is_versioned() if needed
+ @param rows number of modified rows so far */
+ void set_versioned(undo_no_t rows)
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= rows;
+ ut_ad(valid(rows));
+ }
+
+ /** After writing an undo log record, note that the table will be dropped */
+ void set_dropped()
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= BULK;
+ }
+
+ /** Notify the start of a bulk insert operation
+ @param table table to do bulk operation */
+ void start_bulk_insert(dict_table_t *table)
+ {
+ first|= BULK;
+ if (!table->is_temporary())
+ bulk_store= new row_merge_bulk_t(table);
+ }
+
+ /** Notify the end of a bulk insert operation */
+ void end_bulk_insert() { first&= ~BULK; }
+
+ /** @return whether an insert is covered by TRX_UNDO_EMPTY record */
+ bool is_bulk_insert() const { return first & BULK; }
+
+ /** Invoked after partial rollback
+ @param limit number of surviving modified rows (trx_t::undo_no)
+ @return whether this should be erased from trx_t::mod_tables */
+ bool rollback(undo_no_t limit)
+ {
+ ut_ad(valid());
+ if ((LIMIT & first) >= limit)
+ return true;
+ if (first_versioned < limit)
+ first_versioned= NONE;
+ return false;
+ }
+
+#ifdef UNIV_DEBUG
+ void set_aux_table() { fts_aux_table= true; }
+
+ bool is_aux_table() const { return fts_aux_table; }
+#endif /* UNIV_DEBUG */
+
+ /** @return the first undo record that modified the table */
+ undo_no_t get_first() const
+ {
+ ut_ad(valid());
+ return LIMIT & first;
+ }
+
+ /** Add the tuple to the transaction bulk buffer for the given index.
+ @param entry tuple to be inserted
+ @param index bulk insert for the index
+ @param trx transaction */
+ dberr_t bulk_insert_buffered(const dtuple_t &entry,
+ const dict_index_t &index, trx_t *trx)
+ {
+ return bulk_store->bulk_insert_buffered(entry, index, trx);
+ }
+
+ /** Do bulk insert operation present in the buffered operation
+ @return DB_SUCCESS or error code */
+ dberr_t write_bulk(dict_table_t *table, trx_t *trx);
+
+ /** @return whether the buffer storage exist */
+ bool bulk_buffer_exist() const
+ {
+ return bulk_store && is_bulk_insert();
+ }
+
+ /** Free bulk insert operation */
+ void clear_bulk_buffer()
+ {
+ delete bulk_store;
+ bulk_store= nullptr;
+ }
+};
+
+/** Collection of persistent tables and their first modification
+in a transaction.
+We store pointers to the table objects in memory because
+we know that a table object will not be destroyed while a transaction
+that modified it is running. */
+typedef std::map<
+ dict_table_t*, trx_mod_table_time_t,
+ std::less<dict_table_t*>,
+ ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > >
+ trx_mod_tables_t;
+
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_sys.latch before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have pthread_self() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_recovered() may access resurrected (connectionless)
+transactions (state == TRX_STATE_ACTIVE && is_recovered)
+while the system is already processing new user transactions (!is_recovered).
+
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding lock_sys.latch.
+
+* When a transaction handle is in the trx_sys.trx_list, some of its fields
+must not be modified without holding trx->mutex.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys.latch (insertions also by trx->mutex). */
+
+/** Represents an instance of rollback segment along with its state variables.*/
+struct trx_undo_ptr_t {
+ trx_rseg_t* rseg; /*!< rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* undo; /*!< pointer to the undo log, or
+ NULL if nothing logged yet */
+};
+
+/** An instance of temporary rollback segment. */
+struct trx_temp_undo_t {
+ /** temporary rollback segment, or NULL if not assigned yet */
+ trx_rseg_t* rseg;
+ /** pointer to the undo log, or NULL if nothing logged yet */
+ trx_undo_t* undo;
+};
+
+/** Rollback segments assigned to a transaction for undo logging. */
+struct trx_rsegs_t {
+ /** undo log ptr holding reference to a rollback segment that resides in
+ system/undo tablespace used for undo logging of tables that needs
+ to be recovered on crash. */
+ trx_undo_ptr_t m_redo;
+
+ /** undo log for temporary tables; discarded immediately after
+ transaction commit/rollback */
+ trx_temp_undo_t m_noredo;
+};
+
+struct trx_t : ilist_node<>
+{
+private:
+ /**
+ Least significant 31 bits is count of references.
+
+ We can't release the locks nor commit the transaction until this reference
+ is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify
+ that it is no longer "active".
+
+ If the most significant bit is set this transaction should stop inheriting
+ (GAP)locks. Generally set to true during transaction prepare for RC or lower
+ isolation, if requested. Needed for replication replay where
+ we don't want to get blocked on GAP locks taken for protecting
+ concurrent unique insert or replace operation.
+ */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ Atomic_relaxed<uint32_t> skip_lock_inheritance_and_n_ref;
+
+
+public:
+ /** Transaction identifier (0 if no locks were acquired).
+ Set by trx_sys_t::register_rw() or trx_resurrect() before
+ the transaction is added to trx_sys.rw_trx_hash.
+ Cleared in commit_in_memory() after commit_state(),
+ trx_sys_t::deregister_rw(), release_locks(). */
+ trx_id_t id;
+ /** The largest encountered transaction identifier for which no
+ transaction was observed to be active. This is a cache to speed up
+ trx_sys_t::find_same_or_older(). */
+ trx_id_t max_inactive_id;
+
+private:
+ /** mutex protecting state and some of lock
+ (some are protected by lock_sys.latch) */
+ srw_spin_mutex mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of mutex (0 if none); protected by mutex */
+ std::atomic<pthread_t> mutex_owner{0};
+#endif /* UNIV_DEBUG */
+public:
+ void mutex_init() { mutex.init(); }
+ void mutex_destroy() { mutex.destroy(); }
+
+ /** Acquire the mutex */
+ void mutex_lock()
+ {
+ ut_ad(!mutex_is_owner());
+ mutex.wr_lock();
+ ut_ad(!mutex_owner.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release the mutex */
+ void mutex_unlock()
+ {
+ ut_ad(mutex_owner.exchange(0, std::memory_order_relaxed)
+ == pthread_self());
+ mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ bool mutex_is_locked() const noexcept { return mutex.is_locked(); }
+#endif
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the mutex */
+ bool mutex_is_owner() const
+ {
+ return mutex_owner.load(std::memory_order_relaxed) ==
+ pthread_self();
+ }
+#endif /* UNIV_DEBUG */
+
+ /** State of the trx from the point of view of concurrency control
+ and the valid state transitions.
+
+ Possible states:
+
+ TRX_STATE_NOT_STARTED
+ TRX_STATE_ACTIVE
+ TRX_STATE_PREPARED
+ TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
+ TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+ Valid state transitions are:
+
+ Regular transactions:
+ * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+ Auto-commit non-locking read-only:
+ * NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+ XA (2PC):
+ * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+ Recovered XA:
+ * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+ Recovered XA followed by XA ROLLBACK:
+ * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
+
+ XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
+ * NOT_STARTED -> PREPARED -> (freed)
+
+ Disconnected XA PREPARE transaction can become recovered:
+ * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
+
+ Latching and various transaction lists membership rules:
+
+ XA (2PC) transactions are always treated as non-autocommit.
+
+ Transitions to ACTIVE or NOT_STARTED occur when transaction
+ is not in rw_trx_hash.
+
+ Autocommit non-locking read-only transactions move between states
+ without holding any mutex. They are not in rw_trx_hash.
+
+ All transactions, unless they are determined to be ac-nl-ro,
+ explicitly tagged as read-only or read-write, will first be put
+ on the read-only transaction list. Only when a !read-only transaction
+ in the read-only list tries to acquire an X or IX lock on a table
+ do we remove it from the read-only list and put it on the read-write
+ list. During this switch we assign it a rollback segment.
+
+ When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
+ in rw_trx_hash.
+
+ ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
+ The transition ACTIVE->PREPARED is protected by trx->mutex.
+
+ ACTIVE->COMMITTED is possible when the transaction is in
+ rw_trx_hash.
+
+ Transitions to COMMITTED are protected by trx_t::mutex. */
+ Atomic_relaxed<trx_state_t> state;
+
+ /** The locks of the transaction. Protected by lock_sys.latch
+ (insertions also by trx_t::mutex). */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_lock_t lock;
+
+#ifdef WITH_WSREP
+ /** whether wsrep_on(mysql_thd) held at the start of transaction */
+ byte wsrep;
+ bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
+ bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep & 2); }
+#else /* WITH_WSREP */
+ bool is_wsrep() const { return false; }
+#endif /* WITH_WSREP */
+
+ /** Consistent read view of the transaction */
+ ReadView read_view;
+
+ /* These fields are not protected by any mutex. */
+
+ /** false=normal transaction, true=recovered (must be rolled back)
+ or disconnected transaction in XA PREPARE STATE.
+
+ This field is accessed by the thread that owns the transaction,
+ without holding any mutex.
+ There is only one foreign-thread access in trx_print_low()
+ and a possible race condition with trx_disconnect_prepared(). */
+ bool is_recovered;
+ const char* op_info; /*!< English text describing the
+ current operation, or an empty
+ string */
+ uint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
+ bool check_foreigns; /*!< normally TRUE, but if the user
+ wants to suppress foreign key checks,
+ (in table imports, for example) we
+ set this FALSE */
+ /** whether an insert into an empty table is active */
+ bool bulk_insert;
+ /*------------------------------*/
+ /* MySQL has a transaction coordinator to coordinate two phase
+ commit between multiple storage engines and the binary log. When
+ an engine participates in a transaction, it's responsible for
+ registering itself using the trans_register_ha() API. */
+ bool is_registered; /* This flag is set to true after the
+ transaction has been registered with
+ the coordinator using the XA API, and
+ is set to false after commit or
+ rollback. */
+ /** whether this is holding the prepare mutex */
+ bool active_commit_ordered;
+ /*------------------------------*/
+ bool check_unique_secondary;
+ /*!< normally TRUE, but if the user
+ wants to speed up inserts by
+ suppressing unique key checks
+ for secondary indexes when we decide
+ if we can use the insert buffer for
+ them, we set this FALSE */
+ bool flush_log_later;/* In 2PC, we hold the
+ prepare_commit mutex across
+ both phases. In that case, we
+ defer flush of the logs to disk
+ until after we release the
+ mutex. */
+ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+ /** whether this modifies InnoDB dictionary tables */
+ bool dict_operation;
+#ifdef UNIV_DEBUG
+ /** copy of dict_operation during commit() */
+ bool was_dict_operation;
+#endif
+ /** whether dict_sys.latch is held exclusively; protected by
+ dict_sys.latch */
+ bool dict_operation_lock_mode;
+
+ /** wall-clock time of the latest transition to TRX_STATE_ACTIVE;
+ used for diagnostic purposes only */
+ time_t start_time;
+ /** microsecond_interval_timer() of transaction start */
+ ulonglong start_time_micro;
+ lsn_t commit_lsn; /*!< lsn at the time of the commit */
+ /*------------------------------*/
+ THD* mysql_thd; /*!< MySQL thread handle corresponding
+ to this trx, or NULL */
+
+ const char* mysql_log_file_name;
+ /*!< if MySQL binlog is used, this field
+ contains a pointer to the latest file
+ name; this is NULL if binlog is not
+ used */
+ ulonglong mysql_log_offset;
+ /*!< if MySQL binlog is used, this
+ field contains the end offset of the
+ binlog entry */
+ /*------------------------------*/
+ ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables
+ used in the processing of the current
+ SQL statement in MySQL */
+ ib_uint32_t mysql_n_tables_locked;
+ /*!< how many tables the current SQL
+ statement uses, except those
+ in consistent read */
+
+ /** DB_SUCCESS or error code; usually only the thread that is running
+ the transaction is allowed to modify this field. The only exception is
+ when a thread invokes lock_sys_t::cancel() in order to abort a
+ lock_wait(). That is protected by lock_sys.wait_mutex and lock.wait_lock. */
+ dberr_t error_state;
+
+ const dict_index_t*error_info; /*!< if the error number indicates a
+ duplicate key error, a pointer to
+ the problematic index is stored here */
+ ulint error_key_num; /*!< if the index creation fails to a
+ duplicate key error, a mysql key
+ number of that index is stored here */
+ que_t* graph; /*!< query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ /*------------------------------*/
+ UT_LIST_BASE_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
+ oldest first */
+ /*------------------------------*/
+ undo_no_t undo_no; /*!< next undo log record number to
+ assign; since the undo log is
+ private for a transaction, this
+ is a simple ascending sequence
+ with no gaps; thus it represents
+ the number of modified/inserted
+ rows in a transaction */
+ trx_savept_t last_sql_stat_start;
+ /*!< undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this number */
+ trx_rsegs_t rsegs; /* rollback segments for undo logging */
+ undo_no_t roll_limit; /*!< least undo number to undo during
+ a partial rollback; 0 otherwise */
+ bool in_rollback; /*!< true when the transaction is
+ executing a partial or full rollback */
+ ulint pages_undone; /*!< number of undo log pages undone
+ since the last undo log truncation */
+ /*------------------------------*/
+ ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for
+ an SQL statement. This is useful for
+ multi-row INSERTs */
+ ib_vector_t* autoinc_locks; /* AUTOINC locks held by this
+ transaction. Note that these are
+ also in the lock list trx_locks. This
+ vector needs to be freed explicitly
+ when the trx instance is destroyed.
+ Protected by lock_sys.latch. */
+ /*------------------------------*/
+ bool read_only; /*!< true if transaction is flagged
+ as a READ-ONLY transaction.
+ if auto_commit && !will_lock
+ then it will be handled as a
+ AC-NL-RO-SELECT (Auto Commit Non-Locking
+ Read Only Select). A read only
+ transaction will not be assigned an
+ UNDO log. */
+ bool auto_commit; /*!< true if it is an autocommit */
+ bool will_lock; /*!< set to inform trx_start_low() that
+ the transaction may acquire locks */
+ /* True if transaction has to read the undo log and
+ log the DML changes for online DDL table */
+ bool apply_online_log = false;
+
+ /*------------------------------*/
+ fts_trx_t* fts_trx; /*!< FTS information, or NULL if
+ transaction hasn't modified tables
+ with FTS indexes (yet). */
+ doc_id_t fts_next_doc_id;/* The document id used for updates */
+ /*------------------------------*/
+ ib_uint32_t flush_tables; /*!< if "covering" the FLUSH TABLES",
+ count of tables being flushed. */
+
+ /*------------------------------*/
+#ifdef UNIV_DEBUG
+ unsigned start_line; /*!< Track where it was started from */
+ const char* start_file; /*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+
+ XID xid; /*!< X/Open XA transaction
+ identification to identify a
+ transaction branch */
+ trx_mod_tables_t mod_tables; /*!< List of tables that were modified
+ by this transaction */
+ /*------------------------------*/
+ char* detailed_error; /*!< detailed error message for last
+ error, or empty. */
+ rw_trx_hash_element_t *rw_trx_hash_element;
+ LF_PINS *rw_trx_hash_pins;
+ ulint magic_n;
+
+ /** @return whether any persistent undo log has been generated */
+ bool has_logged_persistent() const
+ {
+ return(rsegs.m_redo.undo);
+ }
+
+ /** @return whether any undo log has been generated */
+ bool has_logged() const
+ {
+ return(has_logged_persistent() || rsegs.m_noredo.undo);
+ }
+
+ /** @return rollback segment for modifying temporary tables */
+ trx_rseg_t* get_temp_rseg()
+ {
+ if (trx_rseg_t* rseg = rsegs.m_noredo.rseg) {
+ ut_ad(id != 0);
+ return(rseg);
+ }
+
+ return(assign_temp_rseg());
+ }
+
+ /** Transition to committed state, to release implicit locks. */
+ inline void commit_state();
+
+ /** Release any explicit locks of a committing transaction. */
+ inline void release_locks();
+
+ /** Evict a table definition due to the rollback of ALTER TABLE.
+ @param table_id table identifier
+ @param reset_only whether to only reset dict_table_t::def_trx_id */
+ void evict_table(table_id_t table_id, bool reset_only= false);
+
+ /** Initiate rollback.
+ @param savept savepoint to which to roll back
+ @return error code or DB_SUCCESS */
+ dberr_t rollback(trx_savept_t *savept= nullptr);
+ /** Roll back an active transaction.
+ @param savept savepoint to which to roll back */
+ inline void rollback_low(trx_savept_t *savept= nullptr);
+ /** Finish rollback.
+ @return whether the rollback was completed normally
+ @retval false if the rollback was aborted by shutdown */
+ inline bool rollback_finish();
+private:
+ /** Apply any changes to tables for which online DDL is in progress. */
+ ATTRIBUTE_COLD void apply_log();
+ /** Process tables that were modified by the committing transaction. */
+ inline void commit_tables();
+ /** Mark a transaction committed in the main memory data structures.
+ @param mtr mini-transaction (if there are any persistent modifications) */
+ inline void commit_in_memory(const mtr_t *mtr);
+ /** Write log for committing the transaction. */
+ void commit_persist();
+ /** Clean up the transaction after commit_in_memory() */
+ void commit_cleanup();
+ /** Commit the transaction in a mini-transaction.
+ @param mtr mini-transaction (if there are any persistent modifications) */
+ void commit_low(mtr_t *mtr= nullptr);
+ /** Commit an empty transaction.
+ @param mtr mini-transaction */
+ void commit_empty(mtr_t *mtr);
+ /** Commit an empty transaction.
+ @param mtr mini-transaction */
+ /** Assign the transaction its history serialisation number and write the
+ UNDO log to the assigned rollback segment.
+ @param mtr mini-transaction */
+ inline void write_serialisation_history(mtr_t *mtr);
+public:
+ /** Commit the transaction. */
+ void commit();
+
+ /** Try to drop a persistent table.
+ @param table persistent table
+ @param fk whether to drop FOREIGN KEY metadata
+ @return error code */
+ dberr_t drop_table(const dict_table_t &table);
+ /** Try to drop the foreign key constraints for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_foreign(const table_name_t &name);
+ /** Try to drop the statistics for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_statistics(const table_name_t &name);
+ /** Commit the transaction, possibly after drop_table().
+ @param deleted handles of data files that were deleted */
+ void commit(std::vector<pfs_os_file_t> &deleted);
+
+
+ /** Discard all savepoints */
+ void savepoints_discard()
+ { savepoints_discard(UT_LIST_GET_FIRST(trx_savepoints)); }
+
+
+ /** Discard all savepoints starting from a particular savepoint.
+ @param savept first savepoint to discard */
+ void savepoints_discard(trx_named_savept_t *savept);
+
+
+ bool is_referenced() const
+ {
+ return (skip_lock_inheritance_and_n_ref & ~(1U << 31)) > 0;
+ }
+
+
+ void reference()
+ {
+ ut_d(auto old_n_ref =)
+ skip_lock_inheritance_and_n_ref.fetch_add(1);
+ ut_ad(int32_t(old_n_ref << 1) >= 0);
+ }
+
+ void release_reference()
+ {
+ ut_d(auto old_n_ref =)
+ skip_lock_inheritance_and_n_ref.fetch_sub(1);
+ ut_ad(int32_t(old_n_ref << 1) > 0);
+ }
+
+ bool is_not_inheriting_locks() const
+ {
+ return skip_lock_inheritance_and_n_ref >> 31;
+ }
+
+ void set_skip_lock_inheritance()
+ {
+ ut_d(auto old_n_ref=) skip_lock_inheritance_and_n_ref.fetch_add(1U << 31);
+ ut_ad(!(old_n_ref >> 31));
+ }
+
+ void reset_skip_lock_inheritance()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__("lock btrl $31, %0" : : "m"(skip_lock_inheritance_and_n_ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandreset(
+ reinterpret_cast<volatile long *>(&skip_lock_inheritance_and_n_ref),
+ 31);
+#else
+ skip_lock_inheritance_and_n_ref.fetch_and(~1U << 31);
+#endif
+ }
+
+ /** @return whether the table has lock on
+ mysql.innodb_table_stats or mysql.innodb_index_stats */
+ bool has_stats_table_lock() const;
+
+ /** Free the memory to trx_pools */
+ void free();
+
+
+ void assert_freed() const
+ {
+ ut_ad(state == TRX_STATE_NOT_STARTED);
+ ut_ad(!id);
+ ut_ad(!mutex_is_owner());
+ ut_ad(!has_logged());
+ ut_ad(!is_referenced());
+ ut_ad(!is_wsrep());
+ ut_ad(!lock.was_chosen_as_deadlock_victim);
+ ut_ad(mod_tables.empty());
+ ut_ad(!read_view.is_open());
+ ut_ad(!lock.wait_thr);
+ ut_ad(!lock.wait_lock);
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(lock.table_locks.empty());
+ ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks));
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+ ut_ad(!dict_operation);
+ ut_ad(!apply_online_log);
+ ut_ad(!is_not_inheriting_locks());
+ ut_ad(check_foreigns);
+ ut_ad(check_unique_secondary);
+ }
+
+ /** This has to be invoked on SAVEPOINT or at the end of a statement.
+ Even if a TRX_UNDO_EMPTY record was written for this table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work.
+ @param table table on which any preceding bulk insert ended */
+ void end_bulk_insert(const dict_table_t &table)
+ {
+ auto it= mod_tables.find(const_cast<dict_table_t*>(&table));
+ if (it != mod_tables.end())
+ it->second.end_bulk_insert();
+ }
+
+ /** @return whether this is a non-locking autocommit transaction */
+ bool is_autocommit_non_locking() const { return auto_commit && !will_lock; }
+
+ /** This has to be invoked on SAVEPOINT or at the start of a statement.
+ Even if TRX_UNDO_EMPTY records were written for any table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work. */
+ void end_bulk_insert()
+ {
+ for (auto& t : mod_tables)
+ t.second.end_bulk_insert();
+ }
+
+ /** @return whether a bulk insert into empty table is in progress */
+ bool is_bulk_insert() const
+ {
+ if (!bulk_insert || check_unique_secondary || check_foreigns)
+ return false;
+ for (const auto& t : mod_tables)
+ if (t.second.is_bulk_insert())
+ return true;
+ return false;
+ }
+
+ /** @return logical modification time of a table only
+ if the table has bulk buffer exist in the transaction */
+ trx_mod_table_time_t *check_bulk_buffer(dict_table_t *table)
+ {
+ if (UNIV_LIKELY(!bulk_insert))
+ return nullptr;
+ ut_ad(!check_unique_secondary);
+ ut_ad(!check_foreigns);
+ auto it= mod_tables.find(table);
+ if (it == mod_tables.end() || !it->second.bulk_buffer_exist())
+ return nullptr;
+ return &it->second;
+ }
+
+ /** Do the bulk insert for the buffered insert operation
+ for the transaction.
+ @return DB_SUCCESS or error code */
+ dberr_t bulk_insert_apply()
+ {
+ return UNIV_UNLIKELY(bulk_insert) ? bulk_insert_apply_low(): DB_SUCCESS;
+ }
+
+private:
+ /** Apply the buffered bulk inserts. */
+ dberr_t bulk_insert_apply_low();
+
+ /** Assign a rollback segment for modifying temporary tables.
+ @return the assigned rollback segment */
+ trx_rseg_t *assign_temp_rseg();
+};
+
+/**
+Check if transaction is started.
+@param[in] trx Transaction whose state we need to check
+@reutrn true if transaction is in state started */
+inline bool trx_is_started(const trx_t* trx)
+{
+ return trx->state != TRX_STATE_NOT_STARTED;
+}
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 2 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE 1U /* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE 2U /* duplicate rows are to be replaced */
+
+
+/** Commit node states */
+enum commit_node_state {
+ COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to
+ the transaction */
+ COMMIT_NODE_WAIT /*!< commit signal sent to the transaction,
+ waiting for completion */
+};
+
+/** Commit command node in a query graph */
+struct commit_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_COMMIT */
+ enum commit_node_state
+ state; /*!< node execution state */
+};
+
+
+#include "trx0trx.inl"
+
+#endif
diff --git a/storage/innobase/include/trx0trx.inl b/storage/innobase/include/trx0trx.inl
new file mode 100644
index 00000000..b063c920
--- /dev/null
+++ b/storage/innobase/include/trx0trx.inl
@@ -0,0 +1,86 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.ic
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx->mutex, or it must be the thread
+that is serving a running transaction.
+A running RW transaction must be in trx_sys.rw_trx_hash.
+@return TRUE if trx->state == state */
+UNIV_INLINE
+bool
+trx_state_eq(
+/*=========*/
+ const trx_t* trx, /*!< in: transaction */
+ trx_state_t state, /*!< in: state;
+ if state != TRX_STATE_NOT_STARTED
+ asserts that
+ trx->state != TRX_STATE_NOT_STARTED */
+ bool relaxed)
+ /*!< in: whether to allow
+ trx->state == TRX_STATE_NOT_STARTED
+ after an error has been reported */
+{
+#ifdef UNIV_DEBUG
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ ut_ad(!trx->is_autocommit_non_locking());
+ return(trx->state == state);
+
+ case TRX_STATE_ACTIVE:
+ if (trx->is_autocommit_non_locking()) {
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->read_only);
+ ut_ad(trx->mysql_thd);
+ }
+ return(state == trx->state);
+
+ case TRX_STATE_NOT_STARTED:
+ /* These states are not allowed for running transactions. */
+ ut_a(state == TRX_STATE_NOT_STARTED
+ || (relaxed
+ && thd_get_error_number(trx->mysql_thd)));
+
+ return(true);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(trx->state == state);
+}
+
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx) /*!< in: trx object */
+{
+ return(trx->error_info);
+}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000..bfa2adc0
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+#include "ut0new.h"
+
+#include <vector>
+
+/** printf(3) format used for printing DB_TRX_ID and other system fields */
+#define TRX_ID_FMT IB_ID_FMT
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character. */
+static const ulint TRX_ID_MAX_LEN = 17;
+
+/** Space id of the transaction system page (the system tablespace) */
+static constexpr uint32_t TRX_SYS_SPACE= 0;
+
+/** Page number of the transaction system page */
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/** Random value to check for corruption of trx_t */
+static const ulint TRX_MAGIC_N = 91118598;
+
+constexpr uint innodb_purge_threads_MAX= 32;
+constexpr uint innodb_purge_batch_size_MAX= 5000;
+
+/** Transaction states (trx_t::state) */
+enum trx_state_t {
+ TRX_STATE_NOT_STARTED,
+
+ TRX_STATE_ACTIVE,
+ /** XA PREPARE has been executed; only XA COMMIT or XA ROLLBACK
+ are possible */
+ TRX_STATE_PREPARED,
+ /** XA PREPARE transaction that was returned to ha_recover() */
+ TRX_STATE_PREPARED_RECOVERED,
+ TRX_STATE_COMMITTED_IN_MEMORY
+};
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+struct trx_t;
+/** The locks and state of an active transaction */
+struct trx_lock_t;
+/** Rollback segment */
+struct trx_rseg_t;
+/** Transaction undo log */
+struct trx_undo_t;
+/** Rollback command node in a query graph */
+struct roll_node_t;
+/** Commit command node in a query graph */
+struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+struct trx_named_savept_t;
+/* @} */
+
+/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */
+typedef ib_id_t row_id_t;
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef ib_id_t trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef ib_id_t roll_ptr_t;
+/** Undo number */
+typedef ib_id_t undo_no_t;
+
+/** Transaction savepoint */
+struct trx_savept_t{
+ undo_no_t least_undo_no; /*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Undo segment header */
+typedef byte trx_usegf_t;
+/** Undo log header */
+typedef byte trx_ulogf_t;
+/** Undo log page header */
+typedef byte trx_upagef_t;
+
+/** Undo log record */
+typedef byte trx_undo_rec_t;
+
+/* @} */
+
+/** Info required to purge a record */
+struct trx_purge_rec_t
+{
+ /** Undo log record, or nullptr (roll_ptr!=0 if the log can be skipped) */
+ const trx_undo_rec_t *undo_rec;
+ /** File pointer to undo_rec */
+ roll_ptr_t roll_ptr;
+};
+
+typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t;
+
+/** Number of std::unordered_map hash buckets expected to be needed
+for table IDs in a purge batch. GNU libstdc++ would default to 1 and
+enlarge and rehash on demand. */
+static constexpr size_t TRX_PURGE_TABLE_BUCKETS= 128;
+
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
+static constexpr unsigned TRX_SYS_N_RSEGS= 128;
+/** Maximum number of undo tablespaces (not counting the system tablespace) */
+static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1;
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
new file mode 100644
index 00000000..3d22a33e
--- /dev/null
+++ b/storage/innobase/include/trx0undo.h
@@ -0,0 +1,514 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.h
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "trx0sys.h"
+
+/** The LSB of the "is insert" flag in DB_ROLL_PTR */
+#define ROLL_PTR_INSERT_FLAG_POS 55
+/** The LSB of the 7-bit trx_rseg_t::id in DB_ROLL_PTR */
+#define ROLL_PTR_RSEG_ID_POS 48
+/** The LSB of the 32-bit undo log page number in DB_ROLL_PTR */
+#define ROLL_PTR_PAGE_POS 16
+/** The LSB of the 16-bit byte offset within an undo log page in DB_ROLL_PTR */
+#define ROLL_PTR_BYTE_POS 0
+
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ bool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ uint32_t page_no, /*!< in: page number */
+ uint16_t offset); /*!< in: offset of the undo entry within page */
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ bool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ uint32_t* page_no, /*!< out: page number */
+ uint16_t* offset); /*!< out: offset of the undo
+ entry within page */
+/***********************************************************************//**
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
+UNIV_INLINE
+bool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr); /*!< in: roll pointer */
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+ MY_ATTRIBUTE((warn_unused_result));
+/** Write DB_ROLL_PTR.
+@param[out] ptr buffer
+@param[in] roll_ptr DB_ROLL_PTR value */
+inline void trx_write_roll_ptr(byte* ptr, roll_ptr_t roll_ptr)
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ mach_write_to_7(ptr, roll_ptr);
+}
+/** Read DB_ROLL_PTR.
+@param[in] ptr buffer
+@return roll ptr */
+inline roll_ptr_t trx_read_roll_ptr(const byte* ptr)
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ return mach_read_from_7(ptr);
+}
+
+/** Get the next record in an undo log.
+@param[in] undo_page undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@return undo log record, the page latched, NULL if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+ uint32_t page_no, uint16_t offset);
+/** Get the previous record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, bool shared, mtr_t *mtr);
+
+/** Get the first undo log record on a page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to first record
+@retval nullptr if none exists */
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset);
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out] block undo log page */
+void trx_undo_page_init(const buf_block_t &block);
+
+/** Allocate an undo log page.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any page latch
+@param[out] err error code
+@return X-latched block if success
+@retval nullptr on failure */
+buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Free the last undo log page. The caller must hold the rseg mutex.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any undo log page
+ or that has allocated the undo log page
+@return error code */
+dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Try to truncate the undo logs.
+@param trx transaction
+@return error code */
+dberr_t trx_undo_try_truncate(const trx_t &trx);
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+@param[in,out] rseg rollback segment
+@param[in] hdr_page_no header page number
+@param[in] hdr_offset header offset on the page
+@param[in] limit first undo number to preserve
+(everything below the limit will be truncated)
+@return error code */
+dberr_t
+trx_undo_truncate_start(
+ trx_rseg_t* rseg,
+ uint32_t hdr_page_no,
+ uint16_t hdr_offset,
+ undo_no_t limit)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Mark that an undo log header belongs to a data dictionary transaction.
+@param[in] trx dictionary transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr);
+/** Assign an undo log for a persistent transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@tparam is_temp whether this is temporary undo log
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the undo log block
+@retval nullptr on error */
+template<bool is_temp>
+buf_block_t*
+trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
+ mtr_t *mtr, dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** At shutdown, frees the undo logs of a transaction. */
+void
+trx_undo_free_at_shutdown(trx_t *trx);
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** the only rollback segment type since MariaDB 10.3.1 */
+constexpr uint16_t TRX_UNDO_UPDATE= 2;
+/* TRX_UNDO_STATE values of an undo log segment */
+/** contains an undo log of an active transaction */
+constexpr uint16_t TRX_UNDO_ACTIVE = 1;
+/** cached for quick reuse */
+constexpr uint16_t TRX_UNDO_CACHED = 2;
+/** can be freed in purge when all undo data in it is removed */
+constexpr uint16_t TRX_UNDO_TO_PURGE = 4;
+/** contains an undo log of a prepared transaction */
+constexpr uint16_t TRX_UNDO_PREPARED = 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Transaction undo log memory object; modified by the thread associated
+with the transaction. */
+
+struct trx_undo_t {
+ /*-----------------------------*/
+ ulint id; /*!< undo log slot number within the
+ rollback segment */
+ ulint state; /*!< state of the corresponding undo log
+ segment */
+ trx_id_t trx_id; /*!< id of the trx assigned to the undo
+ log */
+ XID xid; /*!< X/Open XA transaction
+ identification */
+ bool dict_operation; /*!< TRUE if a dict operation trx */
+ trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
+ /*-----------------------------*/
+ uint32_t hdr_page_no; /*!< page number of the header page in
+ the undo log */
+ uint32_t last_page_no; /*!< page number of the last page in the
+ undo log; this may differ from
+ top_page_no during a rollback */
+ uint16_t hdr_offset; /*!< header offset of the undo log on
+ the page */
+ uint32_t size; /*!< current size in pages */
+ /*-----------------------------*/
+ uint32_t top_page_no; /*!< page number where the latest undo
+ log record was catenated; during
+ rollback the page from which the latest
+ undo record was chosen */
+ uint16_t top_offset; /*!< offset of the latest undo record,
+ i.e., the topmost element in the undo
+ log if we think of it as a stack */
+ undo_no_t top_undo_no; /*!< undo number of the latest record
+ (IB_ID_MAX if the undo log is empty) */
+ buf_block_t* guess_block; /*!< guess for the buffer block where
+ the top page might reside */
+
+ /** @return whether the undo log is empty */
+ bool empty() const { return top_undo_no == IB_ID_MAX; }
+
+ /*-----------------------------*/
+ UT_LIST_NODE_T(trx_undo_t) undo_list;
+ /*!< undo log objects in the rollback
+ segment are chained into lists */
+};
+
+/** Cache a pointer to an undo record in a latched buffer pool page,
+parse the undo log record and store the record type, update vector
+and compiler information */
+class UndorecApplier
+{
+ /** Undo log block page id */
+ page_id_t page_id;
+ /** Pointer to within undo log record */
+ const trx_undo_rec_t *undo_rec;
+ /** Undo log record type */
+ byte type;
+ /** compiler information */
+ byte cmpl_info;
+ /** page_offset(undo_rec) of the start of undo_rec */
+ uint16_t offset;
+ /** Transaction id of the undo log */
+ const trx_id_t trx_id;
+ /** Update vector */
+ upd_t *update;
+ /** memory heap which can be used to build previous version of
+ the index record and its offsets */
+ mem_heap_t *heap;
+ /** mini-transaction for accessing B-tree pages */
+ mtr_t mtr;
+
+public:
+ UndorecApplier(page_id_t page_id, trx_id_t trx_id) :
+ page_id(page_id), trx_id(trx_id), heap(mem_heap_create(100))
+ {
+ }
+
+ /** Assign the next page id */
+ void assign_next(const page_id_t next_page_id)
+ {
+ page_id= next_page_id;
+ }
+
+ page_id_t get_page_id() const { return page_id; }
+
+ /** Handle the DML undo log and apply it on online indexes */
+ inline void apply_undo_rec(const trx_undo_rec_t *rec);
+
+ ~UndorecApplier()
+ {
+ mem_heap_free(heap);
+ }
+
+private:
+ /** Handle the insert undo log and apply it on online indexes
+ @param tuple row reference from undo log record
+ @param clust_index clustered index */
+ void log_insert(const dtuple_t &tuple, dict_index_t *clust_index);
+
+ /** Handle the update, delete undo log and apply it on online
+ indexes.
+ @param tuple row reference from undo log record
+ @param clust_index clustered index */
+ void log_update(const dtuple_t &tuple, dict_index_t *clust_index);
+
+ /** Check whether the given roll pointer is generated by
+ the current undo log record information stored.
+ @return true if roll pointer matches with current undo log info */
+ inline bool is_same(roll_ptr_t roll_ptr) const;
+
+ /** Clear the undo log record information */
+ void clear_undo_rec()
+ {
+ undo_rec= nullptr;
+ cmpl_info= 0;
+ type= 0;
+ update= nullptr;
+ mem_heap_empty(heap);
+ }
+
+ /** Get the correct version of the clustered index record that
+ was modified by the current undo log record. Because there could
+ be the multiple successive updates of the same record within the
+ same transaction.
+ @param tuple tuple contains primary key value
+ @param index clustered index
+ @param[out] clust_rec current clustered index record
+ @param offsets offsets points to the record
+ @return clustered index record which was changed by
+ the undo log record or nullptr when there is no clustered
+ index record changed by undo log record */
+ const rec_t* get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+ const rec_t **clust_rec, rec_offs **offsets);
+};
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define TRX_UNDO_PAGE_TYPE 0 /*!< unused; 0 (before MariaDB 10.3.1:
+ 1=TRX_UNDO_INSERT or
+ 2=TRX_UNDO_UPDATE) */
+#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+ /*!< Size of the transaction undo
+ log page header, in bytes */
+/* @} */
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 << (srv_page_size_shift - 2))
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... */
+
+#ifndef UNIV_INNOCHECKSUM
+
+#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header
+ on the segment header page, 0 if
+ none */
+#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which
+ the undo log segment occupies */
+#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE)
+ /*!< Base node for the list of pages in
+ the undo log segment; defined only on
+ the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/** Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+/* @} */
+
+/** The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/* @{ */
+/*-------------------------------------------------------------*/
+/** Transaction start identifier, or 0 if the undo log segment has been
+completely purged and trx_purge_free_segment() has started freeing it */
+#define TRX_UNDO_TRX_ID 0
+/** Transaction end identifier (if the log is in a history list),
+or 0 if the transaction has not been committed */
+#define TRX_UNDO_TRX_NO 8
+/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of
+surviving user records, this used to be called TRX_UNDO_DEL_MARKS.
+
+This field is redundant; it is only being read by some debug assertions.
+
+The value 1 indicates that purge needs to process the undo log segment.
+The value 0 indicates that all of it has been processed, and
+trx_purge_free_segment() has been invoked, so the log is not safe to access.
+
+Before MariaDB 10.3.1, a log segment may carry the value 0 even before
+trx_purge_free_segment() was called, for those undo log records for
+which purge would not result in removing delete-marked records. */
+#define TRX_UNDO_NEEDS_PURGE 16
+#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record
+ of this log on the header page; purge
+ may remove undo log record from the
+ log start, and therefore this is not
+ necessarily the same as this log
+ header end offset */
+#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes
+ X/Open XA transaction identification
+ XID */
+#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table
+ create, index create, or drop
+ transaction: in recovery
+ the transaction cannot be rolled back
+ in the usual way: a 'rollback' rather
+ means dropping the created or dropped
+ table, if it still exists */
+#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding
+ field is TRUE */
+#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header
+ on this page, 0 if none */
+#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log
+ header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history
+ list, the file list node is here */
+/*-------------------------------------------------------------*/
+/** Size of the undo log header without XID information */
+#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+/** X/Open XA Transaction Identification (XID) */
+/* @{ */
+/** xid_t::formatID */
+#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE)
+/** xid_t::gtrid_length */
+#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4)
+/** xid_t::bqual_length */
+#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4)
+/** Distributed transaction identifier data */
+#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4)
+/*--------------------------------------------------------------*/
+#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
+ /*!< Total size of the undo log header
+ with the XA XID */
+/* @} */
+
+#include "trx0undo.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl
new file mode 100644
index 00000000..9f05989f
--- /dev/null
+++ b/storage/innobase/include/trx0undo.inl
@@ -0,0 +1,129 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.ic
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+#include "page0page.h"
+
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ bool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ uint32_t page_no, /*!< in: page number */
+ uint16_t offset) /*!< in: offset of the undo entry within page */
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+
+ return roll_ptr_t{is_insert} << ROLL_PTR_INSERT_FLAG_POS |
+ roll_ptr_t{rseg_id} << ROLL_PTR_RSEG_ID_POS |
+ roll_ptr_t{page_no} << ROLL_PTR_PAGE_POS | offset;
+}
+
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ bool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ uint32_t* page_no, /*!< out: page number */
+ uint16_t* offset) /*!< out: offset of the undo
+ entry within page */
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(roll_ptr < (1ULL << 56));
+ *offset= static_cast<uint16_t>(roll_ptr);
+ *page_no= static_cast<uint32_t>(roll_ptr >> 16);
+ *rseg_id= static_cast<ulint>(roll_ptr >> 48 & 0x7F);
+ *is_insert= static_cast<bool>(roll_ptr >> 55);
+}
+
+/***********************************************************************//**
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
+UNIV_INLINE
+bool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr) /*!< in: roll pointer */
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1)));
+ return static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS);
+}
+
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+{
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+ return bool(trx_id[DATA_TRX_ID_LEN] >> 7);
+}
+
+/** Determine the end offset of undo log records of an undo log page.
+@param[in] undo_page undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset
+@return end offset */
+inline
+uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no,
+ uint16_t offset)
+{
+ if (page_no == undo_page->page.id().page_no())
+ if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset +
+ undo_page->page.frame))
+ return end;
+
+ return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ undo_page->page.frame);
+}
+
+/** Get the next record in an undo log.
+@param[in] undo_page undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@return undo log record, the page latched, NULL if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+ uint32_t page_no, uint16_t offset)
+{
+ uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset);
+ uint16_t next= mach_read_from_2(undo_page->page.frame + rec);
+ return next == end ? nullptr : undo_page->page.frame + next;
+}
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h
new file mode 100644
index 00000000..cb5d67cf
--- /dev/null
+++ b/storage/innobase/include/trx0xa.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef XA_H
+#define XA_H
+
+#include "handler.h"
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#ifndef XIDDATASIZE
+
+/** Sizes of transaction identifier */
+#define XIDDATASIZE 128 /*!< maximum size of a transaction
+ identifier, in bytes */
+#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */
+#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */
+
+#endif
+/** X/Open XA distributed transaction status codes */
+/* @{ */
+#define XA_OK 0 /*!< normal execution */
+#define XAER_ASYNC -2 /*!< asynchronous operation already
+ outstanding */
+#define XAER_RMERR -3 /*!< a resource manager error
+ occurred in the transaction
+ branch */
+#define XAER_NOTA -4 /*!< the XID is not valid */
+#define XAER_INVAL -5 /*!< invalid arguments were given */
+#define XAER_PROTO -6 /*!< routine invoked in an improper
+ context */
+#define XAER_RMFAIL -7 /*!< resource manager unavailable */
+#define XAER_DUPID -8 /*!< the XID already exists */
+#define XAER_OUTSIDE -9 /*!< resource manager doing
+ work outside transaction */
+/* @} */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
new file mode 100644
index 00000000..1b4f70b6
--- /dev/null
+++ b/storage/innobase/include/univ.i
@@ -0,0 +1,503 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***********************************************************************//**
+@file include/univ.i
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#pragma once
+
+/** How far ahead should we tell the service manager the timeout
+(time in seconds) */
+#define INNODB_EXTEND_TIMEOUT_INTERVAL 30
+
+#if defined(_WIN32)
+# include <windows.h>
+#endif /* _WIN32 */
+
+/* Include a minimum number of SQL header files so that few changes
+made in SQL code cause a complete InnoDB rebuild. These headers are
+used throughout InnoDB but do not include too much themselves. They
+support cross-platform development and expose comonly used SQL names. */
+
+#include <my_global.h>
+#include "my_counter.h"
+#include "aligned.h"
+#include <m_string.h>
+#include <mysqld_error.h>
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
+#include <sys/stat.h>
+
+#ifndef _WIN32
+# include <sched.h>
+# include "my_config.h"
+#endif
+
+#include <stdint.h>
+#include <inttypes.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include "my_pthread.h"
+
+/* Following defines are to enable performance schema
+instrumentation in each of five InnoDB modules if
+HAVE_PSI_INTERFACE is defined. */
+#ifdef HAVE_PSI_INTERFACE
+# define UNIV_PFS_MUTEX
+# define UNIV_PFS_RWLOCK
+# define UNIV_PFS_IO
+# define UNIV_PFS_THREAD
+
+# include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */
+# ifdef HAVE_PSI_MEMORY_INTERFACE
+# define UNIV_PFS_MEMORY
+# endif /* HAVE_PSI_MEMORY_INTERFACE */
+
+#ifdef HAVE_PFS_THREAD_PROVIDER_H
+/* For PSI_MUTEX_CALL() and similar. */
+#include "pfs_thread_provider.h"
+#endif
+
+#include "mysql/psi/mysql_thread.h"
+/* For PSI_FILE_CALL(). */
+#ifdef HAVE_PFS_FILE_PROVIDER_H
+#include "pfs_file_provider.h"
+#endif
+
+#include "mysql/psi/mysql_file.h"
+
+#endif /* HAVE_PSI_INTERFACE */
+
+#ifdef _WIN32
+# define YY_NO_UNISTD_H 1
+/* VC++ tries to optimise for size by default, from V8+. The size of
+the pointer to member depends on whether the type is defined before the
+compiler sees the type in the translation unit. This default behaviour
+can cause the pointer to be a different size in different translation
+units, depending on the above rule. We force optimise for size behaviour
+for all cases. This is used by ut0lst.h related code. */
+# pragma pointers_to_members(full_generality, multiple_inheritance)
+#endif /* _WIN32 */
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+
+/* When this macro is defined then additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have "test_" prefix. These functions can be called from the end of
+innodb_init() or they can be called from gdb after srv_start() has executed
+using the call command. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+#define UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+#define UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+#define UNIV_ENABLE_UNIT_TEST_DICT_STATS
+#define UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+*/
+
+#ifdef DBUG_OFF
+# undef UNIV_DEBUG
+#elif !defined UNIV_DEBUG
+# define UNIV_DEBUG
+#endif
+
+#if 0
+#define UNIV_DEBUG_PRINT /* Enable the compilation of
+ some debug print functions */
+#define UNIV_AHI_DEBUG /* Enable adaptive hash index
+ debugging without UNIV_DEBUG */
+#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column
+ debugging without UNIV_DEBUG */
+#define UNIV_DEBUG_LOCK_VALIDATE /* Enable
+ ut_ad(lock_rec_validate_page())
+ assertions. */
+#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG /* debug HASH_ macros */
+#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_PERF_DEBUG /* debug flag that enables
+ light weight performance
+ related stuff. */
+#define UNIV_SEARCH_PERF_STAT /* statistics for the
+ adaptive hash index */
+#define UNIV_BTR_PRINT /* enable functions for
+ printing B-trees */
+#define UNIV_ZIP_DEBUG /* extensive consistency checks
+ for compressed pages */
+#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
+ more often */
+#define UNIV_AIO_DEBUG /* prints info about
+ submitted and reaped AIO
+ requests to the log. */
+#define UNIV_STATS_DEBUG /* prints various stats
+ related debug info from
+ dict0stats.c */
+#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging
+ info output */
+#endif
+
+// #define UNIV_SQL_DEBUG
+
+#ifndef MY_ATTRIBUTE
+#if defined(__GNUC__)
+# define MY_ATTRIBUTE(A) __attribute__(A)
+#else
+# define MY_ATTRIBUTE(A)
+#endif
+#endif
+
+#define UNIV_INLINE static inline
+
+#define UNIV_WORD_SIZE SIZEOF_SIZE_T
+
+/** The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8U
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+#if defined (HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
+#define IF_PUNCH_HOLE(A,B) A
+#else
+#define IF_PUNCH_HOLE(A,B) B
+#endif
+
+/** log2 of smallest compressed page size (1<<10 == 1024 bytes)
+Note: This must never change! */
+#define UNIV_ZIP_SIZE_SHIFT_MIN 10U
+
+/** log2 of largest compressed page size (1<<14 == 16384 bytes).
+A compressed page directory entry reserves 14 bits for the start offset
+and 2 bits for flags. This limits the uncompressed page size to 16k.
+*/
+#define UNIV_ZIP_SIZE_SHIFT_MAX 14U
+
+/* Define the Min, Max, Default page sizes. */
+/** Minimum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MIN 12U
+/** log2 of largest page size (1<<16 == 64436 bytes). */
+/** Maximum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MAX 16U
+/** log2 of default page size (1<<14 == 16384 bytes). */
+/** Default Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_DEF 14U
+/** Original 16k InnoDB Page Size Shift, in case the default changes */
+#define UNIV_PAGE_SIZE_SHIFT_ORIG 14U
+/** Original 16k InnoDB Page Size as an ssize (log2 - 9) */
+#define UNIV_PAGE_SSIZE_ORIG (UNIV_PAGE_SIZE_SHIFT_ORIG - 9U)
+
+/** Minimum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MIN (1U << UNIV_PAGE_SIZE_SHIFT_MIN)
+/** Maximum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MAX (1U << UNIV_PAGE_SIZE_SHIFT_MAX)
+/** Default page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_DEF (1U << UNIV_PAGE_SIZE_SHIFT_DEF)
+/** Original 16k page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_ORIG (1U << UNIV_PAGE_SIZE_SHIFT_ORIG)
+
+/** Smallest compressed page size */
+#define UNIV_ZIP_SIZE_MIN (1U << UNIV_ZIP_SIZE_SHIFT_MIN)
+
+/** Largest compressed page size */
+#define UNIV_ZIP_SIZE_MAX (1U << UNIV_ZIP_SIZE_SHIFT_MAX)
+
+/** Largest possible ssize for an uncompressed page.
+(The convention 'ssize' is used for 'log2 minus 9' or the number of
+shifts starting with 512.)
+This max number varies depending on srv_page_size. */
+#define UNIV_PAGE_SSIZE_MAX \
+ ulint(srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
+
+/** Smallest possible ssize for an uncompressed page. */
+#define UNIV_PAGE_SSIZE_MIN \
+ ulint(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
+
+/** Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/** This is the "mbmaxlen" for my_charset_filename (defined in
+strings/ctype-utf8.c), which is used to encode File and Database names. */
+#define FILENAME_CHARSET_MAXNAMLEN 5
+
+/** The maximum length of an encode table name in bytes. The max
+table and database names are NAME_CHAR_LEN (64) characters. After the
+encoding, the max length would be NAME_CHAR_LEN (64) *
+FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a
+terminating '\0'. InnoDB can handle longer names internally */
+#define MAX_TABLE_NAME_LEN 320
+
+/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is
+the MySQL's NAME_LEN, see check_and_convert_db_name(). */
+#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN
+
+/** MAX_FULL_NAME_LEN defines the full name path including the
+database name and table name. In addition, 14 bytes is added for:
+ 2 for surrounding quotes around table name
+ 1 for the separating dot (.)
+ 9 for the #mysql50# prefix */
+#define MAX_FULL_NAME_LEN \
+ (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14)
+
+/** Maximum length of the compression alogrithm string. Currently we support
+only (NONE | ZLIB | LZ4). */
+#define MAX_COMPRESSION_LEN 4
+
+/** The maximum length in bytes that a database name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_DB_UTF8_LEN (NAME_LEN + 1)
+
+/** The maximum length in bytes that a table name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix))
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/** Unsigned octet of bits */
+typedef unsigned char byte;
+/** Machine-word-width unsigned integer */
+typedef size_t ulint;
+/** Machine-word-width signed integer */
+typedef ssize_t lint;
+
+/** ulint format for the printf() family of functions */
+#define ULINTPF "%zu"
+/** ulint hexadecimal format for the printf() family of functions */
+#define ULINTPFx "%zx"
+
+#ifdef _WIN32
+/* Use the integer types and formatting strings defined in Visual Studio. */
+# define UINT32PF "%u"
+# define UINT64scan "llu"
+# define UINT64PFx "%016llx"
+#elif defined __APPLE__
+/* Apple prefers to call the 64-bit types 'long long'
+in both 32-bit and 64-bit environments. */
+# define UINT32PF "%" PRIu32
+# define UINT64scan "llu"
+# define UINT64PFx "%016llx"
+#elif defined _AIX
+/* Workaround for macros expension trouble */
+# define UINT32PF "%u"
+# define UINT64scan "lu"
+# define UINT64PFx "%016lx"
+#else
+/* Use the integer types and formatting strings defined in the C99 standard. */
+# define UINT32PF "%" PRIu32
+# define INT64PF "%" PRId64
+# define UINT64scan PRIu64
+# define UINT64PFx "%016" PRIx64
+#endif
+
+typedef int64_t ib_int64_t;
+typedef uint64_t ib_uint64_t;
+typedef uint32_t ib_uint32_t;
+
+#define UINT64PF "%" UINT64scan
+#define IB_ID_FMT UINT64PF
+
+/** Log sequence number (also used for redo log byte arithmetics) */
+typedef ib_uint64_t lsn_t;
+
+/** The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+/** The 'undefined' value for a ib_uint64_t */
+#define UINT64_UNDEFINED ((ib_uint64_t)(-1))
+
+/** The bitmask of 32-bit unsigned integer */
+#define ULINT32_MASK 0xFFFFFFFFU
+/** The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED ULINT32_MASK
+
+/** Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+/** Maximum value for ib_uint64_t */
+#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL))
+
+/** The generic InnoDB system object identifier data type */
+typedef ib_uint64_t ib_id_t;
+#define IB_ID_MAX (~(ib_id_t) 0)
+#define IB_ID_FMT UINT64PF
+
+#ifndef UINTMAX_MAX
+#define UINTMAX_MAX IB_UINT64_MAX
+#endif
+/** This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool ulint
+
+#ifndef TRUE
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
+
+#define UNIV_NOTHROW
+
+/** The following number as the length of a logical field means that the field
+has the SQL NULL as its value. NOTE that because we assume that the length
+of a field is a 32-bit integer when we store it, for example, to an undo log
+on disk, we must have also this number fit in 32 bits, also in 64-bit
+computers! */
+
+#define UNIV_SQL_NULL ULINT32_UNDEFINED
+
+/** Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_DEF)
+
+#if defined(__GNUC__)
+/* Tell the compiler that variable/function is unused. */
+# define UNIV_UNUSED MY_ATTRIBUTE ((unused))
+#else
+# define UNIV_UNUSED
+#endif /* CHECK FOR GCC VER_GT_2 */
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#ifdef __GNUC__
+/* Tell the compiler that 'expr' probably evaluates to 'constant'. */
+# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
+/* Tell the compiler that a pointer is likely to be NULL */
+# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ptr) != 0, 0)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read. */
+# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read or written. */
+# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+
+/* Sun Studio includes sun_prefetch.h as of version 5.9 */
+#elif (defined(__SUNPRO_C) || defined(__SUNPRO_CC))
+
+# include <sun_prefetch.h>
+
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+
+//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+
+# elif defined _MSC_VER
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# if defined _M_IX86 || defined _M_X64
+ // __MM_HINT_T0 - (temporal data)
+ // prefetch data into all levels of the cache hierarchy.
+# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# elif defined _M_ARM64
+# define UNIV_PREFETCH_R(addr) __prefetch(addr)
+# define UNIV_PREFETCH_RW(addr) __prefetch(addr)
+# else
+# define UNIV_PREFETCH_R ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif
+#else
+/* Dummy versions of the macros */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+#endif
+
+/* Tell the compiler that cond is likely to hold */
+#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
+/* Tell the compiler that cond is unlikely to hold */
+#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
+
+/* Compile-time constant of the given array's size. */
+#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+#include <stdio.h>
+#include "db0err.h"
+#include "ut0dbg.h"
+#include "ut0lst.h"
+#include "ut0ut.h"
+
+extern uint32_t srv_page_size_shift;
+extern ulong srv_page_size;
+
+/* Dimension of spatial object we support so far. It has its root in
+myisam/sp_defs.h. We only support 2 dimension data */
+#define SPDIMS 2
+
+#ifdef HAVE_PSI_INTERFACE
+typedef unsigned int mysql_pfs_key_t;
+
+# ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t buf_pool_mutex_key;
+extern mysql_pfs_key_t dict_foreign_err_mutex_key;
+extern mysql_pfs_key_t fil_system_mutex_key;
+extern mysql_pfs_key_t flush_list_mutex_key;
+extern mysql_pfs_key_t fts_cache_mutex_key;
+extern mysql_pfs_key_t fts_cache_init_mutex_key;
+extern mysql_pfs_key_t fts_delete_mutex_key;
+extern mysql_pfs_key_t fts_doc_id_mutex_key;
+extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
+extern mysql_pfs_key_t ibuf_mutex_key;
+extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+extern mysql_pfs_key_t recalc_pool_mutex_key;
+extern mysql_pfs_key_t purge_sys_pq_mutex_key;
+extern mysql_pfs_key_t recv_sys_mutex_key;
+extern mysql_pfs_key_t rtr_active_mutex_key;
+extern mysql_pfs_key_t rtr_match_mutex_key;
+extern mysql_pfs_key_t rtr_path_mutex_key;
+extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+extern mysql_pfs_key_t buf_dblwr_mutex_key;
+extern mysql_pfs_key_t trx_pool_mutex_key;
+extern mysql_pfs_key_t trx_pool_manager_mutex_key;
+extern mysql_pfs_key_t lock_wait_mutex_key;
+extern mysql_pfs_key_t srv_threads_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t dict_operation_lock_key;
+extern mysql_pfs_key_t fil_space_latch_key;
+extern mysql_pfs_key_t trx_i_s_cache_lock_key;
+extern mysql_pfs_key_t trx_purge_latch_key;
+extern mysql_pfs_key_t index_tree_rw_lock_key;
+extern mysql_pfs_key_t index_online_log_key;
+extern mysql_pfs_key_t trx_sys_rw_lock_key;
+extern mysql_pfs_key_t lock_latch_key;
+extern mysql_pfs_key_t log_latch_key;
+extern mysql_pfs_key_t trx_rseg_latch_key;
+# endif /* UNIV_PFS_RWLOCK */
+#endif /* HAVE_PSI_INTERFACE */
diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h
new file mode 100644
index 00000000..2b70fac3
--- /dev/null
+++ b/storage/innobase/include/ut0byte.h
@@ -0,0 +1,107 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0byte.h
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+#include "univ.i"
+
+/*******************************************************//**
+Creates a 64-bit integer out of two 32-bit integers.
+@return created integer */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+ MY_ATTRIBUTE((const));
+
+/********************************************************//**
+Rounds a 64-bit integer downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/** Round down a pointer to the nearest aligned address.
+@param ptr pointer
+@param alignment a power of 2
+@return aligned pointer */
+static inline void *ut_align_down(void *ptr, size_t alignment)
+{
+ ut_ad(alignment > 0);
+ ut_ad(ut_is_2pow(alignment));
+ ut_ad(ptr);
+ static_assert(sizeof ptr == sizeof(size_t), "compatibility");
+
+ return reinterpret_cast<void*>(reinterpret_cast<size_t>(ptr) &
+ ~(alignment - 1));
+}
+
+static inline const void *ut_align_down(const void *ptr, size_t alignment)
+{
+ return ut_align_down(const_cast<void*>(ptr), alignment);
+}
+
+/** Compute the offset of a pointer from the nearest aligned address.
+@param ptr pointer
+@param alignment a power of 2
+@return distance from aligned pointer */
+inline size_t ut_align_offset(const void *ptr, size_t alignment)
+{
+ ut_ad(alignment > 0);
+ ut_ad(ut_is_2pow(alignment));
+ ut_ad(ptr);
+ return reinterpret_cast<size_t>(ptr) & (alignment - 1);
+}
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n); /*!< in: nth bit requested */
+
+#include "ut0byte.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0byte.inl b/storage/innobase/include/ut0byte.inl
new file mode 100644
index 00000000..dfa069c2
--- /dev/null
+++ b/storage/innobase/include/ut0byte.inl
@@ -0,0 +1,90 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0byte.ic
+Utilities for byte operations
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/*******************************************************//**
+Creates a 64-bit integer out of two 32-bit integers.
+@return created integer */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+{
+ ut_ad(high <= ULINT32_MASK);
+ ut_ad(low <= ULINT32_MASK);
+ return(((ib_uint64_t) high) << 32 | low);
+}
+
+/********************************************************//**
+Rounds a 64-bit integer downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return(n & ~((ib_uint64_t) align_no - 1));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ib_uint64_t align_1 = (ib_uint64_t) align_no - 1;
+
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return((n + align_1) & ~align_1);
+}
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n) /*!< in: nth bit requested */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+ return(1 & (a >> n));
+}
diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h
new file mode 100644
index 00000000..d6589cc4
--- /dev/null
+++ b/storage/innobase/include/ut0counter.h
@@ -0,0 +1,123 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0counter.h
+
+Counter utility class
+
+Created 2012/04/12 by Sunny Bains
+*******************************************************/
+
+#ifndef ut0counter_h
+#define ut0counter_h
+
+#include "univ.i"
+#include "my_rdtsc.h"
+
+/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles
+as a random value. See the comments for my_timer_cycles() */
+/** @return result from RDTSC or similar functions. */
+static inline size_t
+get_rnd_value()
+{
+ size_t c = static_cast<size_t>(my_timer_cycles());
+
+ if (c != 0) {
+ return c;
+ }
+
+ /* We may go here if my_timer_cycles() returns 0,
+ so we have to have the plan B for the counter. */
+#if !defined(_WIN32)
+ return (size_t)pthread_self();
+#else
+ LARGE_INTEGER cnt;
+ QueryPerformanceCounter(&cnt);
+
+ return static_cast<size_t>(cnt.QuadPart);
+#endif /* !_WIN32 */
+}
+
+/** Atomic which occupies whole CPU cache line.
+Note: We rely on the default constructor of std::atomic and
+do not explicitly initialize the contents. This works for us,
+because ib_counter_t is only intended for usage with global
+memory that is allocated from the .bss and thus guaranteed to
+be zero-initialized by the run-time environment.
+@see srv_stats */
+template <typename Type>
+struct ib_atomic_counter_element_t {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_relaxed<Type> value;
+};
+
+template <typename Type>
+struct ib_counter_element_t {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Type value;
+};
+
+
+/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic
+so the results are not guaranteed to be 100% accurate but close
+enough. */
+template <typename Type,
+ template <typename T> class Element = ib_atomic_counter_element_t,
+ int N = 128 >
+struct ib_counter_t {
+ /** Increment the counter by 1. */
+ void inc() { add(1); }
+ ib_counter_t& operator++() { inc(); return *this; }
+
+ /** Increment the counter by 1.
+ @param[in] index a reasonably thread-unique identifier */
+ void inc(size_t index) { add(index, 1); }
+
+ /** Add to the counter.
+ @param[in] n amount to be added */
+ void add(Type n) { add(get_rnd_value(), n); }
+
+ /** Add to the counter.
+ @param[in] index a reasonably thread-unique identifier
+ @param[in] n amount to be added */
+ TPOOL_SUPPRESS_TSAN void add(size_t index, Type n) {
+ index = index % N;
+
+ ut_ad(index < UT_ARR_SIZE(m_counter));
+
+ m_counter[index].value += n;
+ }
+
+ /* @return total value - not 100% accurate, since it is relaxed atomic*/
+ operator Type() const {
+ Type total = 0;
+
+ for (const auto &counter : m_counter) {
+ total += counter.value;
+ }
+
+ return(total);
+ }
+
+private:
+ static_assert(sizeof(Element<Type>) == CPU_LEVEL1_DCACHE_LINESIZE, "");
+ /** Array of counter elements */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Element<Type> m_counter[N];
+};
+
+#endif /* ut0counter_h */
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h
new file mode 100644
index 00000000..85856660
--- /dev/null
+++ b/storage/innobase/include/ut0dbg.h
@@ -0,0 +1,179 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file include/ut0dbg.h
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#ifdef UNIV_INNOCHECKSUM
+#define ut_a assert
+#define ut_ad assert
+#define ut_error assert(0)
+#else /* !UNIV_INNOCHECKSUM */
+
+/* Do not include univ.i because univ.i includes this. */
+
+/*************************************************************//**
+Report a failed assertion. */
+ATTRIBUTE_NORETURN ATTRIBUTE_COLD __attribute__((nonnull(2)))
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion */
+ const char* file, /*!< in: source file containing the assertion */
+ unsigned line); /*!< in: line number of the assertion */
+
+/** Abort execution if EXPR does not evaluate to nonzero.
+@param EXPR assertion expression that should hold */
+#define ut_a(EXPR) do { \
+ if (UNIV_UNLIKELY(!(ulint) (EXPR))) { \
+ ut_dbg_assertion_failed(#EXPR, \
+ __FILE__, __LINE__); \
+ } \
+} while (0)
+
+/** Abort execution. */
+#define ut_error \
+ ut_dbg_assertion_failed(0, __FILE__, __LINE__)
+
+/** Debug assertion */
+#define ut_ad DBUG_SLOW_ASSERT
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR) EXPR
+#else
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR)
+#endif
+
+#if defined(HAVE_SYS_TIME_H) && defined(HAVE_SYS_RESOURCE_H)
+
+#define HAVE_UT_CHRONO_T
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/** A "chronometer" used to clock snippets of code.
+Example usage:
+ ut_chrono_t ch("this loop");
+ for (;;) { ... }
+ ch.show();
+would print the timings of the for() loop, prefixed with "this loop:" */
+class ut_chrono_t {
+public:
+ /** Constructor.
+ @param[in] name chrono's name, used when showing the values */
+ ut_chrono_t(
+ const char* name)
+ :
+ m_name(name),
+ m_show_from_destructor(true)
+ {
+ reset();
+ }
+
+ /** Resets the chrono (records the current time in it). */
+ void
+ reset()
+ {
+ gettimeofday(&m_tv, NULL);
+
+ getrusage(RUSAGE_SELF, &m_ru);
+ }
+
+ /** Shows the time elapsed and usage statistics since the last reset. */
+ void
+ show()
+ {
+ struct rusage ru_now;
+ struct timeval tv_now;
+ struct timeval tv_diff;
+
+ getrusage(RUSAGE_SELF, &ru_now);
+
+ gettimeofday(&tv_now, NULL);
+
+#ifndef timersub
+#define timersub(a, b, r) \
+ do { \
+ (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+ (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+ if ((r)->tv_usec < 0) { \
+ (r)->tv_sec--; \
+ (r)->tv_usec += 1000000; \
+ } \
+ } while (0)
+#endif /* timersub */
+
+#define CHRONO_PRINT(type, tvp) \
+ fprintf(stderr, "%s: %s% 5ld.%06ld sec\n", \
+ m_name, type, \
+ static_cast<long>((tvp)->tv_sec), \
+ static_cast<long>((tvp)->tv_usec))
+
+ timersub(&tv_now, &m_tv, &tv_diff);
+ CHRONO_PRINT("real", &tv_diff);
+
+ timersub(&ru_now.ru_utime, &m_ru.ru_utime, &tv_diff);
+ CHRONO_PRINT("user", &tv_diff);
+
+ timersub(&ru_now.ru_stime, &m_ru.ru_stime, &tv_diff);
+ CHRONO_PRINT("sys ", &tv_diff);
+ }
+
+ /** Cause the timings not to be printed from the destructor. */
+ void end()
+ {
+ m_show_from_destructor = false;
+ }
+
+ /** Destructor. */
+ ~ut_chrono_t()
+ {
+ if (m_show_from_destructor) {
+ show();
+ }
+ }
+
+private:
+ /** Name of this chronometer. */
+ const char* m_name;
+
+ /** True if the current timings should be printed by the destructor. */
+ bool m_show_from_destructor;
+
+ /** getrusage() result as of the last reset(). */
+ struct rusage m_ru;
+
+ /** gettimeofday() result as of the last reset(). */
+ struct timeval m_tv;
+};
+
+#endif /* HAVE_SYS_TIME_H && HAVE_SYS_RESOURCE_H */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h
new file mode 100644
index 00000000..765f6a2a
--- /dev/null
+++ b/storage/innobase/include/ut0list.h
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.h
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A double-linked list. This differs from the one in ut0lst.h in that in this
+one, each list node contains a pointer to the data, whereas the one in
+ut0lst.h uses a strategy where the list pointers are embedded in the data
+items themselves.
+
+Use this one when you need to store arbitrary data in the list where you
+can't embed the list pointers in the data, if a data item needs to be
+stored in multiple lists, etc.
+
+Note about the memory management: ib_list_t is a fixed-size struct whose
+allocation/deallocation is done through ib_list_create/ib_list_free, but the
+memory for the list nodes is allocated through a user-given memory heap,
+which can either be the same for all nodes or vary per node. Most users will
+probably want to create a memory heap to store the item-specific data, and
+pass in this same heap to the list node creation functions, thus
+automatically freeing the list node when the item's heap is freed.
+
+************************************************************************/
+
+#ifndef IB_LIST_H
+#define IB_LIST_H
+
+#include "mem0mem.h"
+
+struct ib_list_t;
+struct ib_list_node_t;
+
+/****************************************************************//**
+Create a new list using mem_alloc. Lists created with this function must be
+freed with ib_list_free.
+@return list */
+ib_list_t*
+ib_list_create(void);
+/*=================*/
+
+/****************************************************************//**
+Free a list. */
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Remove the node from the list. */
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node); /*!< in: node to remove */
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list); /*!< in: list */
+
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+ /* out: TRUE if empty else */
+ const ib_list_t* list); /* in: list */
+
+/********************************************************************
+Get number of items on list.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+ const ib_list_t* list); /*<! in: list */
+
+/* List. */
+struct ib_list_t {
+ ib_list_node_t* first; /*!< first node */
+ ib_list_node_t* last; /*!< last node */
+};
+
+/* A list node. */
+struct ib_list_node_t {
+ ib_list_node_t* prev; /*!< previous node */
+ ib_list_node_t* next; /*!< next node */
+ void* data; /*!< user data */
+};
+
+/* Quite often, the only additional piece of data you need is the per-item
+memory heap, so we have this generic struct available to use in those
+cases. */
+struct ib_list_helper_t {
+ mem_heap_t* heap; /*!< memory heap */
+ void* data; /*!< user data */
+};
+
+#include "ut0list.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0list.inl b/storage/innobase/include/ut0list.inl
new file mode 100644
index 00000000..3bdba52b
--- /dev/null
+++ b/storage/innobase/include/ut0list.inl
@@ -0,0 +1,80 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.ic
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->first);
+}
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->last);
+}
+
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+ /* out: TRUE if empty else FALSE */
+ const ib_list_t* list) /* in: list */
+{
+ return(!(list->first || list->last));
+}
+
+/********************************************************************
+Get number of items on list.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+ const ib_list_t* list) /*<! in: list */
+{
+ ulint len = 0;
+ ib_list_node_t* node = list->first;
+
+ while(node) {
+ len++;
+ node = node->next;
+ }
+
+ return (len);
+}
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
new file mode 100644
index 00000000..7b7ed7b8
--- /dev/null
+++ b/storage/innobase/include/ut0lst.h
@@ -0,0 +1,563 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0lst.h
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+Rewritten by Sunny Bains Dec 2011.
+***********************************************************************/
+
+#pragma once
+
+/* Do not include univ.i because univ.i includes this. */
+
+#include "ut0dbg.h"
+
+/* This module implements the two-way linear list. Note that a single
+list node may belong to two or more lists, but is only on one list
+at a time. */
+
+/*******************************************************************//**
+The two way list node.
+@param TYPE the list node type name */
+template <typename Type>
+struct ut_list_node {
+ Type* prev; /*!< pointer to the previous
+ node, NULL if start of list */
+ Type* next; /*!< pointer to next node,
+ NULL if end of list */
+
+ void reverse()
+ {
+ Type* tmp = prev;
+ prev = next;
+ next = tmp;
+ }
+};
+
+/** Macro used for legacy reasons */
+#define UT_LIST_NODE_T(t) ut_list_node<t>
+
+/*******************************************************************//**
+The two-way list base node. The base node contains pointers to both ends
+of the list and a count of nodes in the list (excluding the base node
+from the count). We also store a pointer to the member field so that it
+doesn't have to be specified when doing list operations.
+@param Type the type of the list element
+@param NodePtr field member pointer that points to the list node */
+template <typename Type, typename NodePtr>
+struct ut_list_base {
+ typedef Type elem_type;
+ typedef NodePtr node_ptr;
+ typedef ut_list_node<Type> node_type;
+
+ ulint count; /*!< count of nodes in list */
+ elem_type* start; /*!< pointer to list start,
+ NULL if empty */
+ elem_type* end; /*!< pointer to list end,
+ NULL if empty */
+ node_ptr node; /*!< Pointer to member field
+ that is used as a link node */
+#ifdef UNIV_DEBUG
+ ulint init; /*!< UT_LIST_INITIALISED if
+ the list was initialised with
+ UT_LIST_INIT() */
+#endif /* UNIV_DEBUG */
+
+ void reverse()
+ {
+ Type* tmp = start;
+ start = end;
+ end = tmp;
+ }
+};
+
+#define UT_LIST_BASE_NODE_T(t) ut_list_base<t, ut_list_node<t> t::*>
+
+#ifdef UNIV_DEBUG
+# define UT_LIST_INITIALISED 0xCAFE
+# define UT_LIST_INITIALISE(b) (b).init = UT_LIST_INITIALISED
+# define UT_LIST_IS_INITIALISED(b) ut_a(((b).init == UT_LIST_INITIALISED))
+#else
+# define UT_LIST_INITIALISE(b)
+# define UT_LIST_IS_INITIALISED(b)
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Note: This is really the list constructor. We should be able to use
+placement new here.
+Initializes the base node of a two-way list.
+@param b the list base node
+@param pmf point to member field that will be used as the link node */
+#define UT_LIST_INIT(b, pmf) \
+{ \
+ (b).count = 0; \
+ (b).start = 0; \
+ (b).end = 0; \
+ (b).node = pmf; \
+ UT_LIST_INITIALISE(b); \
+}
+
+/** Functor for accessing the embedded node within a list element. This is
+required because some lists can have the node emebedded inside a nested
+struct/union. See lock0priv.h (table locks) for an example. It provides a
+specialised functor to grant access to the list node. */
+template <typename Type>
+struct GenericGetNode {
+
+ typedef ut_list_node<Type> node_type;
+
+ GenericGetNode(node_type Type::* node) : m_node(node) {}
+
+ node_type& operator() (Type& elem)
+ {
+ return(elem.*m_node);
+ }
+
+ node_type Type::*m_node;
+};
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem the element to add */
+template <typename List>
+void
+ut_list_prepend(
+ List& list,
+ typename List::elem_type* elem)
+{
+ typename List::node_type& elem_node = elem->*list.node;
+
+ UT_LIST_IS_INITIALISED(list);
+
+ elem_node.prev = 0;
+ elem_node.next = list.start;
+
+ if (list.start != 0) {
+ typename List::node_type& base_node =
+ list.start->*list.node;
+
+ ut_ad(list.start != elem);
+
+ base_node.prev = elem;
+ }
+
+ list.start = elem;
+
+ if (list.end == 0) {
+ list.end = elem;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM the element to add */
+#define UT_LIST_ADD_FIRST(LIST, ELEM) ut_list_prepend(LIST, ELEM)
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param list list
+@param elem the element to add
+@param get_node to get the list node for that element */
+template <typename List, typename Functor>
+void
+ut_list_append(
+ List& list,
+ typename List::elem_type* elem,
+ Functor get_node)
+{
+ typename List::node_type& node = get_node(*elem);
+
+ UT_LIST_IS_INITIALISED(list);
+
+ node.next = 0;
+ node.prev = list.end;
+
+ if (list.end != 0) {
+ typename List::node_type& base_node = get_node(*list.end);
+
+ ut_ad(list.end != elem);
+
+ base_node.next = elem;
+ }
+
+ list.end = elem;
+
+ if (list.start == 0) {
+ list.start = elem;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param list list
+@param elem the element to add */
+template <typename List>
+void
+ut_list_append(
+ List& list,
+ typename List::elem_type* elem)
+{
+ ut_list_append(
+ list, elem,
+ GenericGetNode<typename List::elem_type>(list.node));
+}
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param LIST list base node (not a pointer to it)
+@param ELEM the element to add */
+#define UT_LIST_ADD_LAST(LIST, ELEM) ut_list_append(LIST, ELEM)
+
+/*******************************************************************//**
+Inserts a ELEM2 after ELEM1 in a list.
+@param list the base node
+@param elem1 node after which ELEM2 is inserted
+@param elem2 node being inserted after ELEM1 */
+template <typename List>
+void
+ut_list_insert(
+ List& list,
+ typename List::elem_type* elem1,
+ typename List::elem_type* elem2)
+{
+ ut_ad(elem1 != elem2);
+ UT_LIST_IS_INITIALISED(list);
+
+ typename List::node_type& elem1_node = elem1->*list.node;
+ typename List::node_type& elem2_node = elem2->*list.node;
+
+ elem2_node.prev = elem1;
+ elem2_node.next = elem1_node.next;
+
+ if (elem1_node.next != NULL) {
+ typename List::node_type& next_node =
+ elem1_node.next->*list.node;
+
+ next_node.prev = elem2;
+ }
+
+ elem1_node.next = elem2;
+
+ if (list.end == elem1) {
+ list.end = elem2;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Inserts a ELEM2 after ELEM1 in a list.
+@param LIST list base node (not a pointer to it)
+@param ELEM1 node after which ELEM2 is inserted
+@param ELEM2 node being inserted after ELEM1 */
+#define UT_LIST_INSERT_AFTER(LIST, ELEM1, ELEM2) \
+ ut_list_insert(LIST, ELEM1, ELEM2)
+
+/*******************************************************************//**
+Inserts a ELEM2 after ELEM1 in a list.
+@param list the base node
+@param elem1 node after which ELEM2 is inserted
+@param elem2 node being inserted after ELEM1
+@param get_node to get the list node for that element */
+
+template <typename List, typename Functor>
+void
+ut_list_insert(
+ List& list,
+ typename List::elem_type* elem1,
+ typename List::elem_type* elem2,
+ Functor get_node)
+{
+ ut_ad(elem1 != elem2);
+ UT_LIST_IS_INITIALISED(list);
+
+ typename List::node_type& elem1_node = get_node(*elem1);
+ typename List::node_type& elem2_node = get_node(*elem2);
+
+ elem2_node.prev = elem1;
+ elem2_node.next = elem1_node.next;
+
+ if (elem1_node.next != NULL) {
+ typename List::node_type& next_node =
+ get_node(*elem1_node.next);
+
+ next_node.prev = elem2;
+ }
+
+ elem1_node.next = elem2;
+
+ if (list.end == elem1) {
+ list.end = elem2;
+ }
+
+ ++list.count;
+
+}
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param node member node within list element that is to be removed
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+ List& list,
+ typename List::node_type& node,
+ Functor get_node)
+{
+ ut_a(list.count > 0);
+ UT_LIST_IS_INITIALISED(list);
+
+ if (node.next != NULL) {
+ typename List::node_type& next_node =
+ get_node(*node.next);
+
+ next_node.prev = node.prev;
+ } else {
+ list.end = node.prev;
+ }
+
+ if (node.prev != NULL) {
+ typename List::node_type& prev_node =
+ get_node(*node.prev);
+
+ prev_node.next = node.next;
+ } else {
+ list.start = node.next;
+ }
+
+ node.next = 0;
+ node.prev = 0;
+
+ --list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+ List& list,
+ typename List::elem_type* elem,
+ Functor get_node)
+{
+ ut_list_remove(list, get_node(*elem), get_node);
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list */
+template <typename List>
+void
+ut_list_remove(
+ List& list,
+ typename List::elem_type* elem)
+{
+ ut_list_remove(
+ list, elem->*list.node,
+ GenericGetNode<typename List::elem_type>(list.node));
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM node to be removed from the list */
+#define UT_LIST_REMOVE(LIST, ELEM) ut_list_remove(LIST, ELEM)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N) (((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N) (((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE) (BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE) (BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE) (BASE).end
+
+struct NullValidate { void operator()(const void*) const {} };
+
+/** Iterate over all the elements and call the functor for each element.
+@param[in] list base node (not a pointer to it)
+@param[in,out] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_map(const List& list, Functor& functor)
+{
+ ulint count = 0;
+
+ UT_LIST_IS_INITIALISED(list);
+
+ for (typename List::elem_type* elem = list.start; elem;
+ elem = (elem->*list.node).next, ++count) {
+
+ functor(elem);
+ }
+
+ ut_a(count == list.count);
+}
+
+/** Iterate over all the elements and call the functor for each element.
+@param[in] list base node (not a pointer to it)
+@param[in] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_map(const List& list, const Functor& functor)
+{
+ ulint count = 0;
+
+ UT_LIST_IS_INITIALISED(list);
+
+ for (typename List::elem_type* elem = list.start; elem;
+ elem = (elem->*list.node).next, ++count) {
+
+ functor(elem);
+ }
+
+ ut_a(count == list.count);
+}
+
+/** Check the consistency of a doubly linked list.
+@param[in] list base node (not a pointer to it)
+@param[in,out] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+void ut_list_validate(const List& list, Functor& functor)
+{
+ ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+ /* Validate the list backwards. */
+ ulint count = list.count;
+
+ for (typename List::elem_type* elem = list.end;
+ elem != 0;
+ elem = (elem->*list.node).prev) {
+ --count;
+ }
+ ut_ad(!count);
+#endif
+}
+
+/** Check the consistency of a doubly linked list.
+@param[in] list base node (not a pointer to it)
+@param[in] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_validate(const List& list, const Functor& functor)
+{
+ ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+ /* Validate the list backwards. */
+ ulint count = list.count;
+
+ for (typename List::elem_type* elem = list.end;
+ elem != 0;
+ elem = (elem->*list.node).prev) {
+ --count;
+ }
+
+ ut_ad(!count);
+#endif
+}
+
+template <typename List>
+inline void ut_list_validate(const List& list)
+{
+ ut_d(ut_list_validate(list, NullValidate()));
+}
+
+#ifdef UNIV_DEBUG
+template <typename List>
+inline void ut_list_reverse(List& list)
+{
+ UT_LIST_IS_INITIALISED(list);
+
+ for (typename List::elem_type* elem = list.start;
+ elem != 0;
+ elem = (elem->*list.node).prev) {
+ (elem->*list.node).reverse();
+ }
+
+ list.reverse();
+}
+
+/** Check if the given element exists in the list.
+@param[in,out] list the list object
+@param[in] elem the element of the list which will be checked */
+template <typename List>
+inline bool ut_list_exists(const List& list, typename List::elem_type* elem)
+{
+ for (typename List::elem_type* e1 = UT_LIST_GET_FIRST(list); e1;
+ e1 = (e1->*list.node).next) {
+ if (elem == e1) {
+ return true;
+ }
+ }
+ return false;
+}
+#endif
+
+/** Move the given element to the beginning of the list.
+@param[in,out] list the list object
+@param[in] elem the element of the list which will be moved
+ to the beginning of the list. */
+template <typename List>
+void
+ut_list_move_to_front(
+ List& list,
+ typename List::elem_type* elem)
+{
+ ut_ad(ut_list_exists(list, elem));
+
+ if (UT_LIST_GET_FIRST(list) != elem) {
+ ut_list_remove(list, elem);
+ ut_list_prepend(list, elem);
+ }
+}
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
new file mode 100644
index 00000000..a5ed72f9
--- /dev/null
+++ b/storage/innobase/include/ut0mem.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+
+/********************************************************************
+Concatenate 3 strings.*/
+char*
+ut_str3cat(
+/*=======*/
+ /* out, own: concatenated string, must be
+ freed with ut_free() */
+ const char* s1, /* in: string 1 */
+ const char* s2, /* in: string 2 */
+ const char* s3); /* in: string 3 */
+
+/**********************************************************************//**
+Converts a raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex", make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size); /*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+#include "ut0mem.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0mem.inl b/storage/innobase/include/ut0mem.inl
new file mode 100644
index 00000000..cc95a036
--- /dev/null
+++ b/storage/innobase/include/ut0mem.inl
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.ic
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/**********************************************************************//**
+Converts a raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex", make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size) /*!< in: "hex" size in bytes */
+{
+
+#ifdef WORDS_BIGENDIAN
+
+#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b))
+
+#define UINT16_GET_A(u) ((char) ((u) >> 8))
+#define UINT16_GET_B(u) ((char) ((u) & 0xFF))
+
+#else /* WORDS_BIGENDIAN */
+
+#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a))
+
+#define UINT16_GET_A(u) ((char) ((u) & 0xFF))
+#define UINT16_GET_B(u) ((char) ((u) >> 8))
+
+#endif /* WORDS_BIGENDIAN */
+
+#define MK_ALL_UINT16_WITH_A(a) \
+ MK_UINT16(a, '0'), \
+ MK_UINT16(a, '1'), \
+ MK_UINT16(a, '2'), \
+ MK_UINT16(a, '3'), \
+ MK_UINT16(a, '4'), \
+ MK_UINT16(a, '5'), \
+ MK_UINT16(a, '6'), \
+ MK_UINT16(a, '7'), \
+ MK_UINT16(a, '8'), \
+ MK_UINT16(a, '9'), \
+ MK_UINT16(a, 'A'), \
+ MK_UINT16(a, 'B'), \
+ MK_UINT16(a, 'C'), \
+ MK_UINT16(a, 'D'), \
+ MK_UINT16(a, 'E'), \
+ MK_UINT16(a, 'F')
+
+ static const uint16 hex_map[256] = {
+ MK_ALL_UINT16_WITH_A('0'),
+ MK_ALL_UINT16_WITH_A('1'),
+ MK_ALL_UINT16_WITH_A('2'),
+ MK_ALL_UINT16_WITH_A('3'),
+ MK_ALL_UINT16_WITH_A('4'),
+ MK_ALL_UINT16_WITH_A('5'),
+ MK_ALL_UINT16_WITH_A('6'),
+ MK_ALL_UINT16_WITH_A('7'),
+ MK_ALL_UINT16_WITH_A('8'),
+ MK_ALL_UINT16_WITH_A('9'),
+ MK_ALL_UINT16_WITH_A('A'),
+ MK_ALL_UINT16_WITH_A('B'),
+ MK_ALL_UINT16_WITH_A('C'),
+ MK_ALL_UINT16_WITH_A('D'),
+ MK_ALL_UINT16_WITH_A('E'),
+ MK_ALL_UINT16_WITH_A('F')
+ };
+ const unsigned char* rawc;
+ ulint read_bytes;
+ ulint write_bytes;
+ ulint i;
+
+ rawc = (const unsigned char*) raw;
+
+ if (hex_size == 0) {
+
+ return(0);
+ }
+
+ if (hex_size <= 2 * raw_size) {
+
+ read_bytes = hex_size / 2;
+ write_bytes = hex_size;
+ } else {
+
+ read_bytes = raw_size;
+ write_bytes = 2 * raw_size + 1;
+ }
+
+#define LOOP_READ_BYTES(ASSIGN) \
+ for (i = 0; i < read_bytes; i++) { \
+ ASSIGN; \
+ hex += 2; \
+ rawc++; \
+ }
+
+ if (ut_align_offset(hex, 2) == 0) {
+
+ LOOP_READ_BYTES(
+ *(uint16*) hex = hex_map[*rawc]
+ );
+ } else {
+
+ LOOP_READ_BYTES(
+ *hex = UINT16_GET_A(hex_map[*rawc]);
+ *(hex + 1) = UINT16_GET_B(hex_map[*rawc])
+ );
+ }
+
+ if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {
+
+ hex--;
+ }
+
+ *hex = '\0';
+
+ return(write_bytes);
+}
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint str_i;
+ ulint buf_i;
+
+ buf_i = 0;
+
+ switch (buf_size) {
+ case 3:
+
+ if (str_len == 0) {
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\'';
+ buf_i++;
+ }
+ /* FALLTHROUGH */
+ case 2:
+ case 1:
+
+ buf[buf_i] = '\0';
+ buf_i++;
+ /* FALLTHROUGH */
+ case 0:
+
+ return(buf_i);
+ }
+
+ /* buf_size >= 4 */
+
+ buf[0] = '\'';
+ buf_i = 1;
+
+ for (str_i = 0; str_i < str_len; str_i++) {
+
+ char ch;
+
+ if (buf_size - buf_i == 2) {
+
+ break;
+ }
+
+ ch = str[str_i];
+
+ switch (ch) {
+ case '\0':
+
+ if (buf_size - buf_i < 4) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = '\\';
+ buf_i++;
+ buf[buf_i] = '0';
+ buf_i++;
+ break;
+ case '\'':
+ case '\\':
+
+ if (buf_size - buf_i < 4) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = ch;
+ buf_i++;
+ /* FALLTHROUGH */
+ default:
+
+ buf[buf_i] = ch;
+ buf_i++;
+ }
+ }
+
+func_exit:
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\0';
+ buf_i++;
+
+ return(buf_i);
+}
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
new file mode 100644
index 00000000..f4183e4c
--- /dev/null
+++ b/storage/innobase/include/ut0new.h
@@ -0,0 +1,1099 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ut/ut0new.h
+Instrumented memory allocator.
+
+Created May 26, 2014 Vasil Dimov
+*******************************************************/
+
+/** Dynamic memory allocation within InnoDB guidelines.
+All dynamic (heap) memory allocations (malloc(3), strdup(3), etc, "new",
+various std:: containers that allocate memory internally), that are done
+within InnoDB are instrumented. This means that InnoDB uses a custom set
+of functions for allocating memory, rather than calling e.g. "new" directly.
+
+Here follows a cheat sheet on what InnoDB functions to use whenever a
+standard one would have been used.
+
+Creating new objects with "new":
+--------------------------------
+Standard:
+ new expression
+ or
+ new(std::nothrow) expression
+InnoDB, default instrumentation:
+ UT_NEW_NOKEY(expression)
+InnoDB, custom instrumentation, preferred:
+ UT_NEW(expression, key)
+
+Destroying objects, created with "new":
+---------------------------------------
+Standard:
+ delete ptr
+InnoDB:
+ UT_DELETE(ptr)
+
+Creating new arrays with "new[]":
+---------------------------------
+Standard:
+ new type[num]
+ or
+ new(std::nothrow) type[num]
+InnoDB, default instrumentation:
+ UT_NEW_ARRAY_NOKEY(type, num)
+InnoDB, custom instrumentation, preferred:
+ UT_NEW_ARRAY(type, num, key)
+
+Destroying arrays, created with "new[]":
+----------------------------------------
+Standard:
+ delete[] ptr
+InnoDB:
+ UT_DELETE_ARRAY(ptr)
+
+Declaring a type with a std:: container, e.g. std::vector:
+----------------------------------------------------------
+Standard:
+ std::vector<t>
+InnoDB:
+ std::vector<t, ut_allocator<t> >
+
+Declaring objects of some std:: type:
+-------------------------------------
+Standard:
+ std::vector<t> v
+InnoDB, default instrumentation:
+ std::vector<t, ut_allocator<t> > v
+InnoDB, custom instrumentation, preferred:
+ std::vector<t, ut_allocator<t> > v(ut_allocator<t>(key))
+
+Raw block allocation (as usual in C++, consider whether using "new" would
+not be more appropriate):
+-------------------------------------------------------------------------
+Standard:
+ malloc(num)
+InnoDB, default instrumentation:
+ ut_malloc_nokey(num)
+InnoDB, custom instrumentation, preferred:
+ ut_malloc(num, key)
+
+Raw block resize:
+-----------------
+Standard:
+ realloc(ptr, new_size)
+InnoDB:
+ ut_realloc(ptr, new_size)
+
+Raw block deallocation:
+-----------------------
+Standard:
+ free(ptr)
+InnoDB:
+ ut_free(ptr)
+
+Note: the expression passed to UT_NEW() or UT_NEW_NOKEY() must always end
+with (), thus:
+Standard:
+ new int
+InnoDB:
+ UT_NEW_NOKEY(int())
+*/
+
+#ifndef ut0new_h
+#define ut0new_h
+
+#include <limits> /* std::numeric_limits */
+#include <thread>
+
+#include <stddef.h>
+#include <stdlib.h> /* malloc() */
+#include <string.h> /* strlen(), strrchr(), strncmp() */
+
+#include <my_sys.h> /* my_large_free/malloc() */
+
+#include "my_global.h" /* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */
+
+#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
+
+#include "ut0ut.h" /* ut_strcmp_functor */
+
+#define OUT_OF_MEMORY_MSG \
+ "Check if you should increase the swap file or ulimits of your" \
+ " operating system. Note that on most 32-bit computers the process" \
+ " memory space is limited to 2 GB or 4 GB."
+
+/** The total amount of memory currently allocated from the operating
+system with allocate_large() */
+extern Atomic_counter<ulint> os_total_large_mem_allocated;
+
+/** Maximum number of retries to allocate memory. */
+extern const size_t alloc_max_retries;
+
+constexpr uint32_t INVALID_AUTOEVENT_IDX = 0xFFFFFFFFU;
+
+/** Keys for registering allocations with performance schema.
+Pointers to these variables are supplied to PFS code via the pfs_info[]
+array and the PFS code initializes them via PSI_MEMORY_CALL(register_memory)().
+mem_key_other and mem_key_std are special in the following way (see also
+ut_allocator::get_mem_key()):
+* If the caller has not provided a key and the file name of the caller is
+ unknown, then mem_key_std will be used. This happens only when called from
+ within std::* containers.
+* If the caller has not provided a key and the file name of the caller is
+ known, but is not amongst the predefined names (see ut_new_boot()) then
+ mem_key_other will be used. Generally this should not happen and if it
+ happens then that means that the list of predefined names must be extended.
+Keep this list alphabetically sorted. */
+extern PSI_memory_key mem_key_ahi;
+extern PSI_memory_key mem_key_buf_buf_pool;
+extern PSI_memory_key mem_key_dict_stats_bg_recalc_pool_t;
+extern PSI_memory_key mem_key_dict_stats_index_map_t;
+extern PSI_memory_key mem_key_dict_stats_n_diff_on_level;
+extern PSI_memory_key mem_key_other;
+extern PSI_memory_key mem_key_row_log_buf;
+extern PSI_memory_key mem_key_row_merge_sort;
+extern PSI_memory_key mem_key_std;
+
+/** Setup the internal objects needed for UT_NEW() to operate.
+This must be called before the first call to UT_NEW(). */
+void
+ut_new_boot();
+
+#ifdef UNIV_PFS_MEMORY
+
+/**
+Retrieve a memory key (registered with PFS),
+given AUTOEVENT_IDX of the caller
+
+@param[in] autoevent_idx - AUTOEVENT_IDX value of the caller
+@return registered memory key or PSI_NOT_INSTRUMENTED */
+PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx);
+
+#endif /* UNIV_PFS_MEMORY */
+
+/** A structure that holds the necessary data for performance schema
+accounting. An object of this type is put in front of each allocated block
+of memory when allocation is done by ut_allocator::allocate(). This is
+because the data is needed even when freeing the memory. Users of
+ut_allocator::allocate_large() are responsible for maintaining this
+themselves. */
+struct ut_new_pfx_t {
+
+#ifdef UNIV_PFS_MEMORY
+
+ /** Performance schema key. Assigned to a name at startup via
+ PSI_MEMORY_CALL(register_memory)() and later used for accounting
+ allocations and deallocations with
+ PSI_MEMORY_CALL(memory_alloc)(key, size, owner) and
+ PSI_MEMORY_CALL(memory_free)(key, size, owner). */
+ PSI_memory_key m_key;
+
+ /**
+ Thread owner.
+ Instrumented thread that owns the allocated memory.
+ This state is used by the performance schema to maintain
+ per thread statistics,
+ when memory is given from thread A to thread B.
+ */
+ struct PSI_thread *m_owner;
+
+#endif /* UNIV_PFS_MEMORY */
+
+ /** Size of the allocated block in bytes, including this prepended
+ aux structure (for ut_allocator::allocate()). For example if InnoDB
+ code requests to allocate 100 bytes, and sizeof(ut_new_pfx_t) is 16,
+ then 116 bytes are allocated in total and m_size will be 116.
+ ut_allocator::allocate_large() does not prepend this struct to the
+ allocated block and its users are responsible for maintaining it
+ and passing it later to ut_allocator::deallocate_large(). */
+ size_t m_size;
+#if SIZEOF_VOIDP == 4
+ /** Pad the header size to a multiple of 64 bits on 32-bit systems,
+ so that the payload will be aligned to 64 bits. */
+ size_t pad;
+#endif
+};
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+static inline void ut_dontdump(void *ptr, size_t m_size, bool dontdump)
+{
+ ut_a(ptr != NULL);
+
+ if (dontdump && madvise(ptr, m_size, MADV_DONTDUMP)) {
+ ib::warn() << "Failed to set memory to " DONTDUMP_STR ": "
+ << strerror(errno)
+ << " ptr " << ptr
+ << " size " << m_size;
+ }
+}
+
+static inline void ut_dodump(void* ptr, size_t m_size)
+{
+ if (ptr && madvise(ptr, m_size, MADV_DODUMP)) {
+ ib::warn() << "Failed to set memory to " DODUMP_STR ": "
+ << strerror(errno)
+ << " ptr " << ptr
+ << " size " << m_size;
+ }
+}
+#else
+static inline void ut_dontdump(void *, size_t, bool) {}
+static inline void ut_dodump(void*, size_t) {}
+#endif
+
+/** Allocator class for allocating memory from inside std::* containers.
+@tparam T type of allocated object
+@tparam oom_fatal whether to commit suicide when running out of memory */
+template <class T, bool oom_fatal = true>
+class ut_allocator {
+public:
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef T value_type;
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+
+#ifdef UNIV_PFS_MEMORY
+ /** Default constructor. */
+ explicit
+ ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED)
+ : m_key(key)
+ {
+ }
+#else
+ ut_allocator() = default;
+ ut_allocator(PSI_memory_key) {}
+#endif /* UNIV_PFS_MEMORY */
+
+ /** Constructor from allocator of another type. */
+ template <class U>
+ ut_allocator(const ut_allocator<U>&
+#ifdef UNIV_PFS_MEMORY
+ other
+#endif
+ )
+ {
+#ifdef UNIV_PFS_MEMORY
+ const PSI_memory_key other_key = other.get_mem_key();
+
+ m_key = (other_key != mem_key_std)
+ ? other_key
+ : PSI_NOT_INSTRUMENTED;
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+ /** Return the maximum number of objects that can be allocated by
+ this allocator. */
+ size_type
+ max_size() const
+ {
+ const size_type s_max = std::numeric_limits<size_type>::max();
+
+#ifdef UNIV_PFS_MEMORY
+ return((s_max - sizeof(ut_new_pfx_t)) / sizeof(T));
+#else
+ return(s_max / sizeof(T));
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+ pointer allocate(size_type n) { return allocate(n, NULL, INVALID_AUTOEVENT_IDX); }
+
+ /** Allocate a chunk of memory that can hold 'n_elements' objects of
+ type 'T' and trace the allocation.
+ If the allocation fails this method may throw an exception. This
+ is mandated by the standard and if it returns NULL instead, then
+ STL containers that use it (e.g. std::vector) may get confused.
+ After successfull allocation the returned pointer must be passed
+ to ut_allocator::deallocate() when no longer needed.
+ @param[in] n_elements number of elements
+ @param[in] set_to_zero if true, then the returned memory is
+ initialized with 0x0 bytes.
+ @param[in] throw_on_error if true, raize exception if too big
+ @return pointer to the allocated memory */
+ pointer
+ allocate(
+ size_type n_elements,
+ const_pointer,
+ uint32_t
+#ifdef UNIV_PFS_MEMORY
+ autoevent_idx /* AUTOEVENT_IDX of the caller */
+#endif
+ ,
+ bool set_to_zero = false,
+ bool throw_on_error = true)
+ {
+ if (n_elements == 0) {
+ return(NULL);
+ }
+
+ if (n_elements > max_size()) {
+ if (throw_on_error) {
+ throw(std::bad_alloc());
+ } else {
+ return(NULL);
+ }
+ }
+
+ void* ptr;
+ size_t total_bytes = n_elements * sizeof(T);
+
+#ifdef UNIV_PFS_MEMORY
+ /* The header size must not ruin the 64-bit alignment
+ on 32-bit systems. Some allocated structures use
+ 64-bit fields. */
+ ut_ad((sizeof(ut_new_pfx_t) & 7) == 0);
+ total_bytes += sizeof(ut_new_pfx_t);
+#endif /* UNIV_PFS_MEMORY */
+
+ for (size_t retries = 1; ; retries++) {
+
+ if (set_to_zero) {
+ ptr = calloc(1, total_bytes);
+ } else {
+ ptr = malloc(total_bytes);
+ }
+
+ if (ptr != NULL || retries >= alloc_max_retries) {
+ break;
+ }
+
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+ }
+
+ if (ptr == NULL) {
+ ib::fatal_or_error(oom_fatal)
+ << "Cannot allocate " << total_bytes
+ << " bytes of memory after "
+ << alloc_max_retries << " retries over "
+ << alloc_max_retries << " seconds. OS error: "
+ << strerror(errno) << " (" << errno << "). "
+ << OUT_OF_MEMORY_MSG;
+ if (throw_on_error) {
+ throw(std::bad_alloc());
+ } else {
+ return(NULL);
+ }
+ }
+
+#ifdef UNIV_PFS_MEMORY
+ ut_new_pfx_t* pfx = static_cast<ut_new_pfx_t*>(ptr);
+
+ allocate_trace(total_bytes, autoevent_idx, pfx);
+
+ return(reinterpret_cast<pointer>(pfx + 1));
+#else
+ return(reinterpret_cast<pointer>(ptr));
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+ /** Free a memory allocated by allocate() and trace the deallocation.
+ @param[in,out] ptr pointer to memory to free */
+ void deallocate(pointer ptr, size_type n_elements = 0)
+ {
+#ifdef UNIV_PFS_MEMORY
+ if (ptr == NULL) {
+ return;
+ }
+
+ ut_new_pfx_t* pfx = reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;
+
+ deallocate_trace(pfx);
+
+ free(pfx);
+#else
+ free(ptr);
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+ /** Create an object of type 'T' using the value 'val' over the
+ memory pointed by 'p'. */
+ void
+ construct(
+ pointer p,
+ const T& val)
+ {
+ new(p) T(val);
+ }
+
+ /** Destroy an object pointed by 'p'. */
+ void
+ destroy(
+ pointer p)
+ {
+ p->~T();
+ }
+
+ /** Return the address of an object. */
+ pointer
+ address(
+ reference x) const
+ {
+ return(&x);
+ }
+
+ /** Return the address of a const object. */
+ const_pointer
+ address(
+ const_reference x) const
+ {
+ return(&x);
+ }
+
+ template <class U>
+ struct rebind {
+ typedef ut_allocator<U> other;
+ };
+
+ /* The following are custom methods, not required by the standard. */
+
+#ifdef UNIV_PFS_MEMORY
+
+ /** realloc(3)-like method.
+ The passed in ptr must have been returned by allocate() and the
+ pointer returned by this method must be passed to deallocate() when
+ no longer needed.
+ @param[in,out] ptr old pointer to reallocate
+ @param[in] n_elements new number of elements to allocate
+ @param[in] file file name of the caller
+ @return newly allocated memory */
+ pointer
+ reallocate(
+ void* ptr,
+ size_type n_elements,
+ uint32_t autoevent_idx)
+ {
+ if (n_elements == 0) {
+ deallocate(static_cast<pointer>(ptr));
+ return(NULL);
+ }
+
+ if (ptr == NULL) {
+ return(allocate(n_elements, NULL, autoevent_idx, false, false));
+ }
+
+ if (n_elements > max_size()) {
+ return(NULL);
+ }
+
+ ut_new_pfx_t* pfx_old;
+ ut_new_pfx_t* pfx_new;
+ size_t total_bytes;
+
+ pfx_old = reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;
+
+ total_bytes = n_elements * sizeof(T) + sizeof(ut_new_pfx_t);
+
+ for (size_t retries = 1; ; retries++) {
+
+ pfx_new = static_cast<ut_new_pfx_t*>(
+ realloc(pfx_old, total_bytes));
+
+ if (pfx_new != NULL || retries >= alloc_max_retries) {
+ break;
+ }
+
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+ }
+
+ if (pfx_new == NULL) {
+ ib::fatal_or_error(oom_fatal)
+ << "Cannot reallocate " << total_bytes
+ << " bytes of memory after "
+ << alloc_max_retries << " retries over "
+ << alloc_max_retries << " seconds. OS error: "
+ << strerror(errno) << " (" << errno << "). "
+ << OUT_OF_MEMORY_MSG;
+ return(NULL);
+ }
+
+ /* pfx_new still contains the description of the old block
+ that was presumably freed by realloc(). */
+ deallocate_trace(pfx_new);
+
+ /* pfx_new is set here to describe the new block. */
+ allocate_trace(total_bytes, autoevent_idx, pfx_new);
+
+ return(reinterpret_cast<pointer>(pfx_new + 1));
+ }
+
+ /** Allocate, trace the allocation and construct 'n_elements' objects
+ of type 'T'. If the allocation fails or if some of the constructors
+ throws an exception, then this method will return NULL. It does not
+ throw exceptions. After successfull completion the returned pointer
+ must be passed to delete_array() when no longer needed.
+ @param[in] n_elements number of elements to allocate
+ @param[in] file file name of the caller
+ @return pointer to the first allocated object or NULL */
+ pointer
+ new_array(
+ size_type n_elements,
+ uint32_t autoevent_idx
+ )
+ {
+ T* p = allocate(n_elements, NULL, autoevent_idx, false, false);
+
+ if (p == NULL) {
+ return(NULL);
+ }
+
+ T* first = p;
+ size_type i;
+
+ try {
+ for (i = 0; i < n_elements; i++) {
+ new(p) T;
+ ++p;
+ }
+ } catch (...) {
+ for (size_type j = 0; j < i; j++) {
+ --p;
+ p->~T();
+ }
+
+ deallocate(first);
+
+ throw;
+ }
+
+ return(first);
+ }
+
+ /** Destroy, deallocate and trace the deallocation of an array created
+ by new_array().
+ @param[in,out] ptr pointer to the first object in the array */
+ void
+ delete_array(
+ T* ptr)
+ {
+ if (ptr == NULL) {
+ return;
+ }
+
+ const size_type n_elements = n_elements_allocated(ptr);
+
+ T* p = ptr + n_elements - 1;
+
+ for (size_type i = 0; i < n_elements; i++) {
+ p->~T();
+ --p;
+ }
+
+ deallocate(ptr);
+ }
+
+#endif /* UNIV_PFS_MEMORY */
+
+ /** Allocate a large chunk of memory that can hold 'n_elements'
+ objects of type 'T' and trace the allocation.
+ @param[in] n_elements number of elements
+ @param[in] dontdump if true, advise the OS is not to core
+ dump this memory.
+ @param[out] pfx storage for the description of the
+ allocated memory. The caller must provide space for this one and keep
+ it until the memory is no longer needed and then pass it to
+ deallocate_large().
+ @return pointer to the allocated memory or NULL */
+ pointer
+ allocate_large(
+ size_type n_elements,
+ ut_new_pfx_t* pfx,
+ bool dontdump = false)
+ {
+ if (n_elements == 0 || n_elements > max_size()) {
+ return(NULL);
+ }
+
+ ulint n_bytes = n_elements * sizeof(T);
+
+ pointer ptr = reinterpret_cast<pointer>(
+ my_large_malloc(&n_bytes, MYF(0)));
+
+ if (ptr == NULL) {
+ return NULL;
+ }
+
+ ut_dontdump(ptr, n_bytes, dontdump);
+
+ if (pfx != NULL) {
+#ifdef UNIV_PFS_MEMORY
+ allocate_trace(n_bytes, 0, pfx);
+#endif /* UNIV_PFS_MEMORY */
+ pfx->m_size = n_bytes;
+ }
+
+ os_total_large_mem_allocated += n_bytes;
+
+ return(ptr);
+ }
+
+ pointer
+ allocate_large_dontdump(
+ size_type n_elements,
+ ut_new_pfx_t* pfx)
+ {
+ return allocate_large(n_elements, pfx, true);
+ }
+ /** Free a memory allocated by allocate_large() and trace the
+ deallocation.
+ @param[in,out] ptr pointer to memory to free
+ @param[in] pfx descriptor of the memory, as returned by
+ allocate_large(). */
+ void
+ deallocate_large(
+ pointer ptr,
+ const ut_new_pfx_t* pfx)
+ {
+ size_t size = pfx->m_size;
+#ifdef UNIV_PFS_MEMORY
+ if (pfx) {
+ deallocate_trace(pfx);
+ }
+#endif /* UNIV_PFS_MEMORY */
+ os_total_large_mem_allocated -= size;
+
+ my_large_free(ptr, size);
+ }
+
+ void
+ deallocate_large_dodump(
+ pointer ptr,
+ const ut_new_pfx_t* pfx)
+ {
+ ut_dodump(ptr, pfx->m_size);
+ deallocate_large(ptr, pfx);
+ }
+
+#ifdef UNIV_PFS_MEMORY
+ /** Get the performance schema key to use for tracing allocations.
+ @param[in] file file name of the caller or NULL if unknown
+ @return performance schema key */
+ PSI_memory_key
+ get_mem_key(
+ uint32_t autoevent_idx = INVALID_AUTOEVENT_IDX) const
+ {
+ if (m_key != PSI_NOT_INSTRUMENTED) {
+ return(m_key);
+ }
+
+ if (autoevent_idx == INVALID_AUTOEVENT_IDX) {
+ return(mem_key_std);
+ }
+ const PSI_memory_key key = ut_new_get_key_by_file(autoevent_idx);
+
+ if (key != PSI_NOT_INSTRUMENTED) {
+ return(key);
+ }
+
+ return(mem_key_other);
+ }
+
+private:
+
+ /** Retrieve the size of a memory block allocated by new_array().
+ @param[in] ptr pointer returned by new_array().
+ @return size of memory block */
+ size_type
+ n_elements_allocated(
+ const_pointer ptr)
+ {
+ const ut_new_pfx_t* pfx
+ = reinterpret_cast<const ut_new_pfx_t*>(ptr) - 1;
+
+ const size_type user_bytes
+ = pfx->m_size - sizeof(ut_new_pfx_t);
+
+ ut_ad(user_bytes % sizeof(T) == 0);
+
+ return(user_bytes / sizeof(T));
+ }
+
+ /** Trace a memory allocation.
+ After the accounting, the data needed for tracing the deallocation
+ later is written into 'pfx'.
+ The PFS event name is picked on the following criteria:
+ 1. If key (!= PSI_NOT_INSTRUMENTED) has been specified when constructing
+ this ut_allocator object, then the name associated with that key will
+ be used (this is the recommended approach for new code)
+ 2. Otherwise, if "file" is NULL, then the name associated with
+ mem_key_std will be used
+ 3. Otherwise, if an entry is found by ut_new_get_key_by_file(), that
+ corresponds to "file", that will be used (see ut_new_boot())
+ 4. Otherwise, the name associated with mem_key_other will be used.
+ @param[in] size number of bytes that were allocated
+ @param[in] autoevent_idx autoevent_idx of the caller
+ @param[out] pfx placeholder to store the info which will be
+ needed when freeing the memory */
+ void
+ allocate_trace(
+ size_t size,
+ const uint32_t autoevent_idx,
+ ut_new_pfx_t* pfx)
+ {
+ const PSI_memory_key key = get_mem_key(autoevent_idx);
+
+ pfx->m_key = PSI_MEMORY_CALL(memory_alloc)(key, size, & pfx->m_owner);
+ pfx->m_size = size;
+ }
+
+ /** Trace a memory deallocation.
+ @param[in] pfx info for the deallocation */
+ void
+ deallocate_trace(
+ const ut_new_pfx_t* pfx)
+ {
+ PSI_MEMORY_CALL(memory_free)(pfx->m_key, pfx->m_size, pfx->m_owner);
+ }
+
+ /** Performance schema key. */
+ PSI_memory_key m_key;
+
+#endif /* UNIV_PFS_MEMORY */
+
+private:
+
+ /** Assignment operator, not used, thus disabled (private). */
+ template <class U>
+ void
+ operator=(
+ const ut_allocator<U>&);
+};
+
+/** Compare two allocators of the same type.
+As long as the type of A1 and A2 is the same, a memory allocated by A1
+could be freed by A2 even if the pfs mem key is different. */
+template <typename T>
+inline
+bool
+operator==(const ut_allocator<T>&, const ut_allocator<T>&) { return(true); }
+
+/** Compare two allocators of the same type. */
+template <typename T>
+inline
+bool
+operator!=(
+ const ut_allocator<T>& lhs,
+ const ut_allocator<T>& rhs)
+{
+ return(!(lhs == rhs));
+}
+
+#ifdef UNIV_PFS_MEMORY
+
+/*
+ constexpr trickery ahead.
+
+ Compute AUTOEVENT_IDX at compile time.
+ (index in the auto_event_names array, corresponding to basename of __FILE__)
+
+ The tricks are necessary to reduce the cost of lookup the
+ PSI_memory_key for auto event.
+*/
+
+static constexpr const char* cexpr_basename_helper(const char* s, const char* last_slash)
+{
+ return
+ *s == '\0' ? last_slash :
+ *s == '/' || *s == '\\' ? cexpr_basename_helper(s + 1, s + 1) :
+ cexpr_basename_helper(s + 1, last_slash);
+}
+
+static constexpr const char* cexpr_basename(const char* filename)
+{
+ return cexpr_basename_helper(filename, filename);
+}
+
+static constexpr bool cexpr_strequal_ignore_dot(const char* a, const char* b)
+{
+ return *a == 0 || *a == '.' ? (*b == 0 || *b == '.')
+ : *a == *b ? cexpr_strequal_ignore_dot(a + 1, b + 1) : false;
+}
+
+constexpr const char* const auto_event_names[] =
+{
+ "btr0btr",
+ "btr0buf",
+ "btr0bulk",
+ "btr0cur",
+ "btr0pcur",
+ "btr0sea",
+ "buf0buf",
+ "buf0dblwr",
+ "buf0dump",
+ "buf0lru",
+ "buf0rea",
+ "dict0dict",
+ "dict0mem",
+ "dict0stats",
+ "eval0eval",
+ "fil0crypt",
+ "fil0fil",
+ "fsp0file",
+ "fts0ast",
+ "fts0blex",
+ "fts0config",
+ "fts0file",
+ "fts0fts",
+ "fts0opt",
+ "fts0pars",
+ "fts0que",
+ "fts0sql",
+ "fts0tlex",
+ "gis0sea",
+ "ha_innodb",
+ "handler0alter",
+ "hash0hash",
+ "i_s",
+ "lexyy",
+ "lock0lock",
+ "mem0mem",
+ "os0file",
+ "pars0lex",
+ "rem0rec",
+ "row0ftsort",
+ "row0import",
+ "row0log",
+ "row0merge",
+ "row0mysql",
+ "row0sel",
+ "srv0start",
+ "trx0i_s",
+ "trx0i_s",
+ "trx0roll",
+ "trx0rseg",
+ "trx0seg",
+ "trx0trx",
+ "trx0undo",
+ "ut0list",
+ "ut0mem",
+ "ut0new",
+ "ut0pool",
+ "ut0rbt",
+ "ut0wqueue",
+ "xtrabackup",
+ nullptr
+};
+
+constexpr uint32_t cexpr_lookup_auto_event_name(const char* name, uint32_t idx = 0)
+{
+ return !auto_event_names[idx] ? INVALID_AUTOEVENT_IDX :
+ cexpr_strequal_ignore_dot(name, auto_event_names[idx]) ? idx :
+ cexpr_lookup_auto_event_name(name, idx + 1);
+}
+
+/*
+ The AUTOEVENT_IDX macro.
+
+ Note, that there is a static_assert that checks whether
+ basename of the __FILE is not registered in the auto_event_names array.
+ If you run into this assert, add the basename to the array.
+
+ Weird looking lambda is used to force the evaluation at the compile time.
+*/
+#define AUTOEVENT_IDX []()\
+{\
+ constexpr auto idx = cexpr_lookup_auto_event_name(cexpr_basename(__FILE__)); \
+ static_assert(idx != INVALID_AUTOEVENT_IDX, "auto_event_names contains no entry for " __FILE__);\
+ return idx; \
+}()
+
+
+/** Allocate, trace the allocation and construct an object.
+Use this macro instead of 'new' within InnoDB.
+For example: instead of
+ Foo* f = new Foo(args);
+use:
+ Foo* f = UT_NEW(Foo(args), mem_key_some);
+Upon failure to allocate the memory, this macro may return NULL. It
+will not throw exceptions. After successfull allocation the returned
+pointer must be passed to UT_DELETE() when no longer needed.
+@param[in] expr any expression that could follow "new"
+@param[in] key performance schema memory tracing key
+@return pointer to the created object or NULL */
+#define UT_NEW(expr, key) \
+ /* Placement new will return NULL and not attempt to construct an
+ object if the passed in pointer is NULL, e.g. if allocate() has
+ failed to allocate memory and has returned NULL. */ \
+ ::new(ut_allocator<byte>(key).allocate( \
+ sizeof expr, NULL, AUTOEVENT_IDX, false, false)) expr
+
+/** Allocate, trace the allocation and construct an object.
+Use this macro instead of 'new' within InnoDB and instead of UT_NEW()
+when creating a dedicated memory key is not feasible.
+For example: instead of
+ Foo* f = new Foo(args);
+use:
+ Foo* f = UT_NEW_NOKEY(Foo(args));
+Upon failure to allocate the memory, this macro may return NULL. It
+will not throw exceptions. After successfull allocation the returned
+pointer must be passed to UT_DELETE() when no longer needed.
+@param[in] expr any expression that could follow "new"
+@return pointer to the created object or NULL */
+#define UT_NEW_NOKEY(expr) UT_NEW(expr, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an object created by
+UT_NEW() or UT_NEW_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE(ptr) ut_delete(ptr)
+
+
+/** Destroy and account object created by UT_NEW() or UT_NEW_NOKEY().
+@param[in,out] ptr pointer to the object */
+template <typename T>
+inline
+void
+ut_delete(
+ T* ptr)
+{
+ if (ptr == NULL) {
+ return;
+ }
+
+ ut_allocator<T> allocator;
+
+ allocator.destroy(ptr);
+ allocator.deallocate(ptr);
+}
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]'.
+The returned pointer must be passed to UT_DELETE_ARRAY().
+@param[in] type type of objects being created
+@param[in] n_elements number of objects to create
+@param[in] key performance schema memory tracing key
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY(type, n_elements, key) \
+ ut_allocator<type>(key).new_array(n_elements, AUTOEVENT_IDX)
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]' and
+instead of UT_NEW_ARRAY() when it is not feasible to create a dedicated key.
+@param[in] type type of objects being created
+@param[in] n_elements number of objects to create
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY_NOKEY(type, n_elements) \
+ UT_NEW_ARRAY(type, n_elements, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an array created by
+UT_NEW_ARRAY() or UT_NEW_ARRAY_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE_ARRAY(ptr) ut_delete_array(ptr)
+
+/** Destroy and account objects created by UT_NEW_ARRAY() or
+UT_NEW_ARRAY_NOKEY().
+@param[in,out] ptr pointer to the first object in the array */
+template <typename T>
+inline
+void
+ut_delete_array(
+ T* ptr)
+{
+ ut_allocator<T>().delete_array(ptr);
+}
+
+#define ut_malloc(n_bytes, key) static_cast<void*>( \
+ ut_allocator<byte>(key).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_malloc_dontdump(n_bytes, key) static_cast<void*>( \
+ ut_allocator<byte>(key).allocate_large( \
+ n_bytes, NULL, true))
+
+#define ut_zalloc(n_bytes, key) static_cast<void*>( \
+ ut_allocator<byte>(key).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_malloc_nokey(n_bytes) static_cast<void*>( \
+ ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_zalloc_nokey(n_bytes) static_cast<void*>( \
+ ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_zalloc_nokey_nofatal(n_bytes) static_cast<void*>( \
+ ut_allocator<byte, false>(PSI_NOT_INSTRUMENTED).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_realloc(ptr, n_bytes) static_cast<void*>( \
+ ut_allocator<byte>(PSI_NOT_INSTRUMENTED).reallocate( \
+ ptr, n_bytes, AUTOEVENT_IDX))
+
+#define ut_free(ptr) ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \
+ reinterpret_cast<byte*>(ptr))
+
+#else /* UNIV_PFS_MEMORY */
+
+/* Fallbacks when memory tracing is disabled at compile time. */
+
+#define UT_NEW(expr, key) ::new(std::nothrow) expr
+#define UT_NEW_NOKEY(expr) ::new(std::nothrow) expr
+#define UT_DELETE(ptr) ::delete ptr
+
+#define UT_NEW_ARRAY(type, n_elements, key) \
+ ::new(std::nothrow) type[n_elements]
+
+#define UT_NEW_ARRAY_NOKEY(type, n_elements) \
+ ::new(std::nothrow) type[n_elements]
+
+#define UT_DELETE_ARRAY(ptr) ::delete[] ptr
+
+#define ut_malloc(n_bytes, key) ::malloc(n_bytes)
+
+#define ut_zalloc(n_bytes, key) ::calloc(1, n_bytes)
+
+#define ut_malloc_nokey(n_bytes) ::malloc(n_bytes)
+
+static inline void *ut_malloc_dontdump(size_t n_bytes, ...)
+{
+ void *ptr = my_large_malloc(&n_bytes, MYF(0));
+
+ ut_dontdump(ptr, n_bytes, true);
+
+ if (ptr) {
+ os_total_large_mem_allocated += n_bytes;
+ }
+ return ptr;
+}
+
+#define ut_zalloc_nokey(n_bytes) ::calloc(1, n_bytes)
+
+#define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes)
+
+#define ut_realloc(ptr, n_bytes) ::realloc(ptr, n_bytes)
+
+#define ut_free(ptr) ::free(ptr)
+
+#endif /* UNIV_PFS_MEMORY */
+
+static inline void ut_free_dodump(void *ptr, size_t size)
+{
+ ut_dodump(ptr, size);
+ os_total_large_mem_allocated -= size;
+ my_large_free(ptr, size);
+}
+
+#endif /* ut0new_h */
diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h
new file mode 100644
index 00000000..aa0cfb9e
--- /dev/null
+++ b/storage/innobase/include/ut0pool.h
@@ -0,0 +1,365 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0pool.h
+Object pool.
+
+Created 2012-Feb-26 Sunny Bains
+***********************************************************************/
+
+#ifndef ut0pool_h
+#define ut0pool_h
+
+#include <vector>
+#include <queue>
+#include <functional>
+
+#include <my_global.h>
+
+/** Allocate the memory for the object in blocks. We keep the objects sorted
+on pointer so that they are closer together in case they have to be iterated
+over in a list. */
+template <typename Type, typename Factory, typename LockStrategy>
+struct Pool {
+
+ typedef Type value_type;
+
+ struct Element {
+ Pool* m_pool;
+ value_type m_type;
+ };
+
+ /** Constructor
+ @param size size of the memory block */
+ Pool(size_t size)
+ :
+ m_end(),
+ m_start(),
+ m_size(size),
+ m_last()
+ {
+ ut_ad(ut_is_2pow(size));
+ ut_a(size >= sizeof(Element));
+ static_assert(!(sizeof(Element) % CPU_LEVEL1_DCACHE_LINESIZE),
+ "alignment");
+
+ m_lock_strategy.create();
+
+ ut_a(m_start == 0);
+
+ m_start = static_cast<Element*>(
+ aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE));
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(
+ m_start, 0, m_size);
+
+ m_last = m_start;
+
+ m_end = &m_start[m_size / sizeof *m_start];
+
+ /* Note: Initialise only a small subset, even though we have
+ allocated all the memory. This is required only because PFS
+ (MTR) results change if we instantiate too many mutexes up
+ front. */
+
+ init(ut_min(size_t(16), size_t(m_end - m_start)));
+
+ ut_ad(m_pqueue.size() <= size_t(m_last - m_start));
+ }
+
+ /** Destructor */
+ ~Pool()
+ {
+ m_lock_strategy.destroy();
+
+ for (Element* elem = m_start; elem != m_last; ++elem) {
+
+ ut_ad(elem->m_pool == this);
+ Factory::destroy(&elem->m_type);
+ }
+
+ IF_WIN(_aligned_free,free)(m_start);
+ m_end = m_last = m_start = 0;
+ m_size = 0;
+ }
+
+ /** Get an object from the pool.
+ @retrun a free instance or NULL if exhausted. */
+ Type* get()
+ {
+ Element* elem;
+
+ m_lock_strategy.enter();
+
+ if (!m_pqueue.empty()) {
+
+ elem = m_pqueue.top();
+ m_pqueue.pop();
+
+ } else if (m_last < m_end) {
+
+ /* Initialise the remaining elements. */
+ init(size_t(m_end - m_last));
+
+ ut_ad(!m_pqueue.empty());
+
+ elem = m_pqueue.top();
+ m_pqueue.pop();
+ } else {
+ elem = NULL;
+ }
+
+ m_lock_strategy.exit();
+ return elem ? &elem->m_type : NULL;
+ }
+
+ /** Add the object to the pool.
+ @param ptr object to free */
+ static void mem_free(value_type* ptr)
+ {
+ Element* elem;
+ byte* p = reinterpret_cast<byte*>(ptr + 1);
+
+ elem = reinterpret_cast<Element*>(p - sizeof(*elem));
+
+ elem->m_pool->m_lock_strategy.enter();
+
+ elem->m_pool->putl(elem);
+
+ elem->m_pool->m_lock_strategy.exit();
+ }
+
+protected:
+ // Disable copying
+ Pool(const Pool&);
+ Pool& operator=(const Pool&);
+
+private:
+
+ /* We only need to compare on pointer address. */
+ typedef std::priority_queue<
+ Element*,
+ std::vector<Element*, ut_allocator<Element*> >,
+ std::greater<Element*> > pqueue_t;
+
+ /** Release the object to the free pool
+ @param elem element to free */
+ void putl(Element* elem)
+ {
+ ut_ad(elem >= m_start && elem < m_last);
+ m_pqueue.push(elem);
+ }
+
+ /** Initialise the elements.
+ @param n_elems Number of elements to initialise */
+ void init(size_t n_elems)
+ {
+ ut_ad(size_t(m_end - m_last) >= n_elems);
+
+ for (size_t i = 0; i < n_elems; ++i, ++m_last) {
+
+ m_last->m_pool = this;
+ Factory::init(&m_last->m_type);
+ m_pqueue.push(m_last);
+ }
+
+ ut_ad(m_last <= m_end);
+ }
+
+private:
+ /** Pointer to the last element */
+ Element* m_end;
+
+ /** Pointer to the first element */
+ Element* m_start;
+
+ /** Size of the block in bytes */
+ size_t m_size;
+
+ /** Upper limit of used space */
+ Element* m_last;
+
+ /** Priority queue ordered on the pointer addresse. */
+ pqueue_t m_pqueue;
+
+ /** Lock strategy to use */
+ LockStrategy m_lock_strategy;
+};
+
+template <typename Pool, typename LockStrategy>
+struct PoolManager {
+
+ typedef Pool PoolType;
+ typedef typename PoolType::value_type value_type;
+
+ PoolManager(size_t size)
+ :
+ m_size(size)
+ {
+ create();
+ }
+
+ ~PoolManager()
+ {
+ destroy();
+
+ ut_a(m_pools.empty());
+ }
+
+ /** Get an element from one of the pools.
+ @return instance or NULL if pool is empty. */
+ value_type* get()
+ {
+ size_t index = 0;
+ size_t delay = 1;
+ value_type* ptr = NULL;
+
+ do {
+ m_lock_strategy.enter();
+
+ ut_ad(!m_pools.empty());
+
+ size_t n_pools = m_pools.size();
+
+ PoolType* pool = m_pools[index % n_pools];
+
+ m_lock_strategy.exit();
+
+ ptr = pool->get();
+
+ if (ptr == 0 && (index / n_pools) > 2) {
+
+ if (!add_pool(n_pools)) {
+
+ ib::error() << "Failed to allocate"
+ " memory for a pool of size "
+ << m_size << " bytes. Will"
+ " wait for " << delay
+ << " seconds for a thread to"
+ " free a resource";
+
+ /* There is nothing much we can do
+ except crash and burn, however lets
+ be a little optimistic and wait for
+ a resource to be freed. */
+ std::this_thread::sleep_for(
+ std::chrono::seconds(delay));
+
+ if (delay < 32) {
+ delay <<= 1;
+ }
+
+ } else {
+ delay = 1;
+ }
+ }
+
+ ++index;
+
+ } while (ptr == NULL);
+
+ return(ptr);
+ }
+
+ static void mem_free(value_type* ptr)
+ {
+ PoolType::mem_free(ptr);
+ }
+
+private:
+ /** Add a new pool
+ @param n_pools Number of pools that existed when the add pool was
+ called.
+ @return true on success */
+ bool add_pool(size_t n_pools)
+ {
+ bool added = false;
+
+ m_lock_strategy.enter();
+
+ if (n_pools < m_pools.size()) {
+ /* Some other thread already added a pool. */
+ added = true;
+ } else {
+ PoolType* pool;
+
+ ut_ad(n_pools == m_pools.size());
+
+ pool = UT_NEW_NOKEY(PoolType(m_size));
+
+ if (pool != NULL) {
+ m_pools.push_back(pool);
+
+ ib::info() << "Number of transaction pools: "
+ << m_pools.size();
+
+ added = true;
+ }
+ }
+
+ ut_ad(n_pools < m_pools.size() || !added);
+
+ m_lock_strategy.exit();
+
+ return(added);
+ }
+
+ /** Create the pool manager. */
+ void create()
+ {
+ ut_a(m_size > sizeof(value_type));
+ m_lock_strategy.create();
+
+ add_pool(0);
+ }
+
+ /** Release the resources. */
+ void destroy()
+ {
+ typename Pools::iterator it;
+ typename Pools::iterator end = m_pools.end();
+
+ for (it = m_pools.begin(); it != end; ++it) {
+ PoolType* pool = *it;
+
+ UT_DELETE(pool);
+ }
+
+ m_pools.clear();
+
+ m_lock_strategy.destroy();
+ }
+private:
+ // Disable copying
+ PoolManager(const PoolManager&);
+ PoolManager& operator=(const PoolManager&);
+
+ typedef std::vector<PoolType*, ut_allocator<PoolType*> > Pools;
+
+ /** Size of each block */
+ size_t m_size;
+
+ /** Pools managed this manager */
+ Pools m_pools;
+
+ /** Lock strategy to use */
+ LockStrategy m_lock_strategy;
+};
+
+#endif /* ut0pool_h */
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
new file mode 100644
index 00000000..38071165
--- /dev/null
+++ b/storage/innobase/include/ut0rbt.h
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************************//**
+@file include/ut0rbt.h
+Various utilities
+
+Created 2007-03-20 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define ut_malloc malloc
+#define ut_free free
+#define ulint unsigned long
+#define ut_a(c) assert(c)
+#define ut_error assert(0)
+#define ibool unsigned int
+#define TRUE 1
+#define FALSE 0
+#endif
+
+struct ib_rbt_node_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
+
+/** Red black tree color types */
+enum ib_rbt_color_t {
+ IB_RBT_RED,
+ IB_RBT_BLACK
+};
+
+/** Red black tree node */
+struct ib_rbt_node_t {
+ ib_rbt_color_t color; /* color of this node */
+
+ ib_rbt_node_t* left; /* points left child */
+ ib_rbt_node_t* right; /* points right child */
+ ib_rbt_node_t* parent; /* points parent node */
+
+ char value[1]; /* Data value */
+};
+
+/** Red black tree instance.*/
+struct ib_rbt_t {
+ ib_rbt_node_t* nil; /* Black colored node that is
+ used as a sentinel. This is
+ pre-allocated too.*/
+
+ ib_rbt_node_t* root; /* Root of the tree, this is
+ pre-allocated and the first
+ data node is the left child.*/
+
+ ulint n_nodes; /* Total number of data nodes */
+
+ ib_rbt_compare compare; /* Fn. to use for comparison */
+ ib_rbt_arg_compare
+ compare_with_arg; /* Fn. to use for comparison
+ with argument */
+ ulint sizeof_value; /* Sizeof the item in bytes */
+ void* cmp_arg; /* Compare func argument */
+};
+
+/** The result of searching for a key in the tree, this is useful for
+a speedy lookup and insert if key doesn't exist.*/
+struct ib_rbt_bound_t {
+ const ib_rbt_node_t*
+ last; /* Last node visited */
+
+ int result; /* Result of comparing with
+ the last non-nil node that
+ was visited */
+};
+
+/* Size in elements (t is an rb tree instance) */
+#define rbt_size(t) (t->n_nodes)
+
+/* Check whether the rb tree is empty (t is an rb tree instance) */
+#define rbt_empty(t) (rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is an rb tree node instance) */
+#define rbt_value(t, n) ((t*) &n->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node)*/
+#define rbt_compare(t, k, n) (t->compare(k, n->value))
+
+/**********************************************************************//**
+Free an instance of a red black tree */
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree); /*!< in: rb tree to free */
+/**********************************************************************//**
+Create an instance of a red black tree
+@return rb tree instance */
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_compare compare); /*!< in: comparator */
+/**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return rb tree instance */
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_arg_compare
+ compare, /*!< in: comparator */
+ void* cmp_arg); /*!< in: compare fn arg */
+/**********************************************************************//**
+Delete a node from the red black tree, identified by key */
+ibool
+rbt_delete(
+/*=======*/
+ /* in: TRUE on success */
+ ib_rbt_t* tree, /* in: rb tree */
+ const void* key); /* in: key to delete */
+/**********************************************************************//**
+Remove a node from the red black tree, NOTE: This function will not delete
+the node instance, THAT IS THE CALLERS RESPONSIBILITY.
+@return the deleted node with the const. */
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t*
+ node); /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller has access
+ only to const nodes.*/
+/**********************************************************************//**
+Add data to the red black tree, identified by key (no dups yet!)
+@return inserted node */
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value); /*!< in: data that will be
+ copied to the node.*/
+/**********************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ const void* value); /*!< in: this value is copied
+ to the node */
+/**********************************************************************//**
+Return the left most data node in the tree
+@return left most node */
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/**********************************************************************//**
+Return the right most data node in the tree
+@return right most node */
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/**********************************************************************//**
+Return the next node from current.
+@return successor node to current that is passed in. */
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /* in: current node */
+ current);
+/**********************************************************************//**
+Return the prev node from current.
+@return precedessor node to current that is passed in */
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /* in: current node */
+ current);
+/**********************************************************************//**
+Search for the key, a node will be retuned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key); /*!< in: key to search */
+/**********************************************************************//**
+Search for the key, a node will be retuned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare, /*!< in: comparator */
+ ib_rbt_arg_compare
+ arg_compare); /*!< in: fn to compare items
+ with argument */
+/**********************************************************************//**
+Merge the node from dst into src. Return the number of nodes merged.
+@return no. of recs merged */
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src); /*!< in: src rb tree */
+#if defined UNIV_DEBUG || defined IB_RBT_TESTING
+/**********************************************************************//**
+Verify the integrity of the RB tree. For debugging. 0 failure else height
+of tree (in count of black nodes).
+@return TRUE if OK FALSE if tree invalid. */
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree); /*!< in: tree to validate */
+#endif /* UNIV_DEBUG || IB_RBT_TESTING */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
new file mode 100644
index 00000000..511eb21f
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.h
@@ -0,0 +1,128 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "ut0byte.h"
+#include <my_sys.h>
+
+#ifndef UNIV_INNOCHECKSUM
+/** Seed value of ut_rnd_gen() */
+extern std::atomic<uint32_t> ut_rnd_current;
+
+/** @return a pseudo-random 32-bit number */
+inline uint32_t ut_rnd_gen()
+{
+ /* This is a Galois linear-feedback shift register.
+ https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Galois_LFSRs
+ The generating primitive Galois Field polynomial is the Castagnoli
+ polynomial that was made popular by CRC-32C:
+ x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+
+ x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */
+ const uint32_t crc32c= 0x1edc6f41;
+
+ uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed);
+
+ if (UNIV_UNLIKELY(rnd == 0))
+ {
+ rnd= static_cast<uint32_t>(my_interval_timer());
+ if (!rnd) rnd= 1;
+ }
+ else
+ {
+ bool lsb= rnd & 1;
+ rnd>>= 1;
+ if (lsb)
+ rnd^= crc32c;
+ }
+
+ ut_rnd_current.store(rnd, std::memory_order_relaxed);
+ return rnd;
+}
+
+/** @return a random number between 0 and n-1, inclusive */
+inline ulint ut_rnd_interval(ulint n)
+{
+ return n > 1 ? static_cast<ulint>(ut_rnd_gen() % n) : 0;
+}
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+random number to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size); /*!< in: hash table size */
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+ ib_uint64_t d) /*!< in: 64-bit integer */
+ MY_ATTRIBUTE((const));
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+ MY_ATTRIBUTE((const));
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+ MY_ATTRIBUTE((const));
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+ MY_ATTRIBUTE((pure));
+
+#include "ut0rnd.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0rnd.inl b/storage/innobase/include/ut0rnd.inl
new file mode 100644
index 00000000..37da323f
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.inl
@@ -0,0 +1,128 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0rnd.ic
+Random numbers and hashing
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK 1463735687
+#define UT_HASH_RANDOM_MASK2 1653893711
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size) /*!< in: hash table size */
+{
+ ut_ad(table_size);
+ key = key ^ UT_HASH_RANDOM_MASK2;
+
+ return(key % table_size);
+}
+
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+ ib_uint64_t d) /*!< in: 64-bit integer */
+{
+ return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK,
+ (ulint) (d >> 32)));
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+{
+ return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+ ^ UT_HASH_RANDOM_MASK) + n2);
+}
+
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+{
+ ulint fold = 0;
+ const byte* str_end = str + (len & 0xFFFFFFF8);
+
+ ut_ad(str || !len);
+
+ while (str < str_end) {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ }
+
+ switch (len & 0x7) {
+ case 7:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 6:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 5:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 4:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 3:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 2:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 1:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ }
+
+ return(fold);
+}
diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h
new file mode 100644
index 00000000..4f1d4c04
--- /dev/null
+++ b/storage/innobase/include/ut0sort.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort which has logarithmic
+worst case.
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array.
+Also the comparison function has to be defined individually
+for each array cell type. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments.
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1, if the first is bigger,
+0 if equal, and -1 if the second bigger. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+ ulint ut_sort_mid77;\
+ ulint ut_sort_i77;\
+ ulint ut_sort_low77;\
+ ulint ut_sort_high77;\
+\
+ ut_ad((LOW) < (HIGH));\
+ ut_ad(ARR);\
+ ut_ad(AUX_ARR);\
+\
+ if ((LOW) == (HIGH) - 1) {\
+ return;\
+ } else if ((LOW) == (HIGH) - 2) {\
+ if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+ (AUX_ARR)[LOW] = (ARR)[LOW];\
+ (ARR)[LOW] = (ARR)[(HIGH) - 1];\
+ (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+ }\
+ return;\
+ }\
+\
+ ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+ SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+ SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+ ut_sort_low77 = (LOW);\
+ ut_sort_high77 = ut_sort_mid77;\
+\
+ for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+ if (ut_sort_low77 >= ut_sort_mid77) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else if (ut_sort_high77 >= (HIGH)) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ } else if (CMP_FUN((ARR)[ut_sort_low77],\
+ (ARR)[ut_sort_high77]) > 0) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ }\
+ }\
+\
+ memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\
+ ((HIGH) - (LOW)) * sizeof *(ARR));\
+}\
+
+
+#endif
+
diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h
new file mode 100644
index 00000000..17fbd91b
--- /dev/null
+++ b/storage/innobase/include/ut0stage.h
@@ -0,0 +1,499 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ut/ut0stage.h
+Supplementary code to performance schema stage instrumentation.
+
+Created Nov 12, 2014 Vasil Dimov
+*******************************************************/
+
+#ifndef ut0stage_h
+#define ut0stage_h
+
+#include <algorithm>
+#include <math.h>
+
+#include "my_global.h" /* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */
+#include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */
+
+#include "dict0mem.h" /* dict_index_t */
+#include "row0log.h" /* row_log_estimate_work() */
+#include "srv0srv.h" /* ut_stage_alter_t */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+
+/** Class used to report ALTER TABLE progress via performance_schema.
+The only user of this class is the ALTER TABLE code and it calls the methods
+in the following order
+constructor
+begin_phase_read_pk()
+ multiple times:
+ n_pk_recs_inc() // once per record read
+ inc() // once per page read
+end_phase_read_pk()
+if any new indexes are being added, for each one:
+ begin_phase_sort()
+ multiple times:
+ inc() // once per record sorted
+ begin_phase_insert()
+ multiple times:
+ inc() // once per record inserted
+ being_phase_log_index()
+ multiple times:
+ inc() // once per log-block applied
+begin_phase_log_table()
+ multiple times:
+ inc() // once per log-block applied
+begin_phase_end()
+destructor
+
+This class knows the specifics of each phase and tries to increment the
+progress in an even manner across the entire ALTER TABLE lifetime. */
+class ut_stage_alter_t {
+public:
+ /** Constructor.
+ @param[in] pk primary key of the old table */
+ explicit
+ ut_stage_alter_t(
+ const dict_index_t* pk)
+ :
+ m_progress(NULL),
+ m_pk(pk),
+ m_n_pk_recs(0),
+ m_n_pk_pages(0),
+ m_n_recs_processed(0),
+ m_cur_phase(NOT_STARTED)
+ {
+ }
+
+ /** Destructor. */
+ ~ut_stage_alter_t();
+
+ /** Flag an ALTER TABLE start (read primary key phase).
+ @param[in] n_sort_indexes number of indexes that will be sorted
+ during ALTER TABLE, used for estimating the total work to be done */
+ void
+ begin_phase_read_pk(
+ ulint n_sort_indexes);
+
+ /** Increment the number of records in PK (table) with 1.
+ This is used to get more accurate estimate about the number of
+ records per page which is needed because some phases work on
+ per-page basis while some work on per-record basis and we want
+ to get the progress as even as possible. */
+ void
+ n_pk_recs_inc();
+
+ /** Flag either one record or one page processed, depending on the
+ current phase.
+ @param[in] inc_val flag this many units processed at once */
+ void
+ inc(
+ ulint inc_val = 1);
+
+ /** Flag the end of reading of the primary key.
+ Here we know the exact number of pages and records and calculate
+ the number of records per page and refresh the estimate. */
+ void
+ end_phase_read_pk();
+
+ /** Flag the beginning of the sort phase.
+ @param[in] sort_multi_factor since merge sort processes
+ one page more than once we only update the estimate once per this
+ many pages processed. */
+ void
+ begin_phase_sort(
+ double sort_multi_factor);
+
+ /** Flag the beginning of the insert phase. */
+ void
+ begin_phase_insert();
+
+ /** Flag the beginning of the log index phase. */
+ void
+ begin_phase_log_index();
+
+ /** Flag the beginning of the log table phase. */
+ void
+ begin_phase_log_table();
+
+ /** Flag the beginning of the end phase. */
+ void
+ begin_phase_end();
+
+private:
+
+ /** Update the estimate of total work to be done. */
+ void
+ reestimate();
+
+ /** Change the current phase.
+ @param[in] new_stage pointer to the new stage to change to */
+ void
+ change_phase(
+ const PSI_stage_info* new_stage);
+
+ /** Performance schema accounting object. */
+ PSI_stage_progress* m_progress;
+
+ /** Old table PK. Used for calculating the estimate. */
+ const dict_index_t* m_pk;
+
+ /** Number of records in the primary key (table), including delete
+ marked records. */
+ ulint m_n_pk_recs;
+
+ /** Number of leaf pages in the primary key. */
+ ulint m_n_pk_pages;
+
+ /** Estimated number of records per page in the primary key. */
+ double m_n_recs_per_page;
+
+ /** Number of indexes that are being added. */
+ ulint m_n_sort_indexes;
+
+ /** During the sort phase, increment the counter once per this
+ many pages processed. This is because sort processes one page more
+ than once. */
+ ulint m_sort_multi_factor;
+
+ /** Number of records processed during sort & insert phases. We
+ need to increment the counter only once page, or once per
+ recs-per-page records. */
+ ulint m_n_recs_processed;
+
+ /** Current phase. */
+ enum {
+ NOT_STARTED = 0,
+ READ_PK = 1,
+ SORT = 2,
+ INSERT = 3,
+ /* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h
+ LOG_INDEX = 5,
+ LOG_TABLE = 6, */
+ LOG_INNODB_INDEX = 5,
+ LOG_INNODB_TABLE = 6,
+ END = 7,
+ } m_cur_phase;
+};
+
+/** Destructor. */
+inline
+ut_stage_alter_t::~ut_stage_alter_t()
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ /* Set completed = estimated before we quit. */
+ mysql_stage_set_work_completed(
+ m_progress,
+ mysql_stage_get_work_estimated(m_progress));
+
+ mysql_end_stage();
+}
+
+/** Flag an ALTER TABLE start (read primary key phase).
+@param[in] n_sort_indexes number of indexes that will be sorted
+during ALTER TABLE, used for estimating the total work to be done */
+inline
+void
+ut_stage_alter_t::begin_phase_read_pk(
+ ulint n_sort_indexes)
+{
+ m_n_sort_indexes = n_sort_indexes;
+
+ m_cur_phase = READ_PK;
+
+ m_progress = mysql_set_stage(
+ srv_stage_alter_table_read_pk_internal_sort.m_key);
+
+ mysql_stage_set_work_completed(m_progress, 0);
+ reestimate();
+}
+
+/** Increment the number of records in PK (table) with 1.
+This is used to get more accurate estimate about the number of
+records per page which is needed because some phases work on
+per-page basis while some work on per-record basis and we want
+to get the progress as even as possible. */
+inline
+void
+ut_stage_alter_t::n_pk_recs_inc()
+{
+ m_n_pk_recs++;
+}
+
+/** Flag either one record or one page processed, depending on the
+current phase. */
+inline
+void
+ut_stage_alter_t::inc(ulint inc_val)
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ ulint multi_factor = 1;
+ bool should_proceed = true;
+
+ switch (m_cur_phase) {
+ case NOT_STARTED:
+ ut_error;
+ case READ_PK:
+ m_n_pk_pages++;
+ ut_ad(inc_val == 1);
+ /* Overall the read pk phase will read all the pages from the
+ PK and will do work, proportional to the number of added
+ indexes, thus when this is called once per read page we
+ increment with 1 + m_n_sort_indexes */
+ inc_val = 1 + m_n_sort_indexes;
+ break;
+ case SORT:
+ multi_factor = m_sort_multi_factor;
+ /* fall through */
+ case INSERT: {
+ /* Increment the progress every nth record. During
+ sort and insert phases, this method is called once per
+ record processed. We need fractional point numbers here
+ because "records per page" is such a number naturally and
+ to avoid rounding skew we want, for example: if there are
+ (double) N records per page, then the work_completed
+ should be incremented on the inc() calls round(k*N),
+ for k=1,2,3... */
+ const double every_nth = m_n_recs_per_page *
+ static_cast<double>(multi_factor);
+
+ const ulint k = static_cast<ulint>(
+ round(static_cast<double>(m_n_recs_processed) /
+ every_nth));
+
+ const ulint nth = static_cast<ulint>(
+ round(static_cast<double>(k) * every_nth));
+
+ should_proceed = m_n_recs_processed == nth;
+
+ m_n_recs_processed++;
+
+ break;
+ }
+ /* JAN: TODO: MySQL 5.7
+ case LOG_INDEX:
+ break;
+ case LOG_TABLE:
+ break; */
+ case LOG_INNODB_INDEX:
+ case LOG_INNODB_TABLE:
+ break;
+ case END:
+ break;
+ }
+
+ if (should_proceed) {
+ mysql_stage_inc_work_completed(m_progress, inc_val);
+ reestimate();
+ }
+}
+
+/** Flag the end of reading of the primary key.
+Here we know the exact number of pages and records and calculate
+the number of records per page and refresh the estimate. */
+inline
+void
+ut_stage_alter_t::end_phase_read_pk()
+{
+ reestimate();
+
+ if (m_n_pk_pages == 0) {
+ /* The number of pages in the PK could be 0 if the tree is
+ empty. In this case we set m_n_recs_per_page to 1 to avoid
+ division by zero later. */
+ m_n_recs_per_page = 1.0;
+ } else {
+ m_n_recs_per_page = std::max(
+ static_cast<double>(m_n_pk_recs)
+ / static_cast<double>(m_n_pk_pages),
+ 1.0);
+ }
+}
+
+/** Flag the beginning of the sort phase.
+@param[in] sort_multi_factor since merge sort processes
+one page more than once we only update the estimate once per this
+many pages processed. */
+inline
+void
+ut_stage_alter_t::begin_phase_sort(
+ double sort_multi_factor)
+{
+ if (sort_multi_factor <= 1.0) {
+ m_sort_multi_factor = 1;
+ } else {
+ m_sort_multi_factor = static_cast<ulint>(
+ round(sort_multi_factor));
+ }
+
+ change_phase(&srv_stage_alter_table_merge_sort);
+}
+
+/** Flag the beginning of the insert phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_insert()
+{
+ change_phase(&srv_stage_alter_table_insert);
+}
+
+/** Flag the beginning of the log index phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_log_index()
+{
+ change_phase(&srv_stage_alter_table_log_index);
+}
+
+/** Flag the beginning of the log table phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_log_table()
+{
+ change_phase(&srv_stage_alter_table_log_table);
+}
+
+/** Flag the beginning of the end phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_end()
+{
+ change_phase(&srv_stage_alter_table_end);
+}
+
+/** Update the estimate of total work to be done. */
+inline
+void
+ut_stage_alter_t::reestimate()
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ /* During the log table phase we calculate the estimate as
+ work done so far + log size remaining. */
+ if (m_cur_phase == LOG_INNODB_TABLE) {
+ mysql_stage_set_work_estimated(
+ m_progress,
+ mysql_stage_get_work_completed(m_progress)
+ + row_log_estimate_work(m_pk));
+ return;
+ }
+
+ /* During the other phases we use a formula, regardless of
+ how much work has been done so far. */
+
+ /* For number of pages in the PK - if the PK has not been
+ read yet, use stat_n_leaf_pages (approximate), otherwise
+ use the exact number we gathered. */
+ const ulint n_pk_pages
+ = m_cur_phase != READ_PK
+ ? m_n_pk_pages
+ : m_pk->stat_n_leaf_pages;
+
+ ulonglong estimate __attribute__((unused))
+ = n_pk_pages
+ * (1 /* read PK */
+ + m_n_sort_indexes /* row_merge_buf_sort() inside the
+ read PK per created index */
+ + m_n_sort_indexes * 2 /* sort & insert per created index */)
+ + row_log_estimate_work(m_pk);
+
+ /* Prevent estimate < completed */
+ estimate = std::max(estimate,
+ mysql_stage_get_work_completed(m_progress));
+
+ mysql_stage_set_work_estimated(m_progress, estimate);
+}
+
+/** Change the current phase.
+@param[in] new_stage pointer to the new stage to change to */
+inline
+void
+ut_stage_alter_t::change_phase(
+ const PSI_stage_info* new_stage)
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) {
+ m_cur_phase = READ_PK;
+ } else if (new_stage == &srv_stage_alter_table_merge_sort) {
+ m_cur_phase = SORT;
+ } else if (new_stage == &srv_stage_alter_table_insert) {
+ m_cur_phase = INSERT;
+ /* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */
+ } else if (new_stage == &srv_stage_alter_table_log_index) {
+ m_cur_phase = LOG_INNODB_INDEX;
+ } else if (new_stage == &srv_stage_alter_table_log_table) {
+ m_cur_phase = LOG_INNODB_TABLE;
+ } else if (new_stage == &srv_stage_alter_table_end) {
+ m_cur_phase = END;
+ } else {
+ ut_error;
+ }
+
+ const ulonglong c = mysql_stage_get_work_completed(m_progress);
+ const ulonglong e = mysql_stage_get_work_estimated(m_progress);
+
+ m_progress = mysql_set_stage(new_stage->m_key);
+
+ mysql_stage_set_work_completed(m_progress, c);
+ mysql_stage_set_work_estimated(m_progress, e);
+}
+#else /* HAVE_PSI_STAGE_INTERFACE */
+
+class ut_stage_alter_t {
+public:
+ explicit ut_stage_alter_t(const dict_index_t*) {}
+
+ void begin_phase_read_pk(ulint) {}
+
+ void n_pk_recs_inc() {}
+
+ void inc() {}
+ void inc(ulint) {}
+
+ void end_phase_read_pk() {}
+
+ void begin_phase_sort(double) {}
+
+ void begin_phase_insert() {}
+
+ void begin_phase_log_index() {}
+
+ void begin_phase_log_table() {}
+
+ void begin_phase_end() {}
+};
+
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+#endif /* ut0stage_h */
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
new file mode 100644
index 00000000..fe16ce14
--- /dev/null
+++ b/storage/innobase/include/ut0ut.h
@@ -0,0 +1,444 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0ut.h
+Various utilities
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+#include <ostream>
+#include <sstream>
+#include <string.h>
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "db0err.h"
+
+#include <time.h>
+
+#ifndef MYSQL_SERVER
+#include <ctype.h>
+#endif /* MYSQL_SERVER */
+
+#include <stdarg.h>
+
+#include <string>
+
+/** Index name prefix in fast index creation, as a string constant */
+#define TEMP_INDEX_PREFIX_STR "\377"
+
+#define ut_max std::max
+#define ut_min std::min
+
+/** Calculate the minimum of two pairs.
+@param[out] min_hi MSB of the minimum pair
+@param[out] min_lo LSB of the minimum pair
+@param[in] a_hi MSB of the first pair
+@param[in] a_lo LSB of the first pair
+@param[in] b_hi MSB of the second pair
+@param[in] b_lo LSB of the second pair */
+UNIV_INLINE
+void
+ut_pair_min(
+ ulint* min_hi,
+ ulint* min_lo,
+ ulint a_hi,
+ ulint a_lo,
+ ulint b_hi,
+ ulint b_lo);
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b); /*!< in: ulint */
+/** Compare two pairs of integers.
+@param[in] a_h more significant part of first pair
+@param[in] a_l less significant part of first pair
+@param[in] b_h more significant part of second pair
+@param[in] b_l less significant part of second pair
+@return comparison result of (a_h,a_l) and (b_h,b_l)
+@retval -1 if (a_h,a_l) is less than (b_h,b_l)
+@retval 0 if (a_h,a_l) is equal to (b_h,b_l)
+@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */
+UNIV_INLINE
+int
+ut_pair_cmp(
+ ulint a_h,
+ ulint a_l,
+ ulint b_h,
+ ulint b_l)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+Calculates fast the remainder of n/m when m is a power of two.
+@param n in: numerator
+@param m in: denominator, must be a power of two
+@return the remainder of n/m */
+template <typename T> inline T ut_2pow_remainder(T n, T m){return n & (m - 1);}
+/*************************************************************//**
+Calculates the biggest multiple of m that is not bigger than n
+when m is a power of two. In other words, rounds n down to m * k.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+template <typename T> inline T ut_2pow_round(T n, T m) { return n & ~(m - 1); }
+/********************************************************//**
+Calculates the smallest multiple of m that is not smaller than n
+when m is a power of two. In other words, rounds n up to m * k.
+@param n in: number to round up
+@param m in: alignment, must be a power of two
+@return n rounded up to the smallest possible integer multiple of m */
+#define UT_CALC_ALIGN(n, m) ((n + m - 1) & ~(m - 1))
+template <typename T> inline T ut_calc_align(T n, T m)
+{ return static_cast<T>(UT_CALC_ALIGN(n, m)); }
+
+/*************************************************************//**
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n); /*!< in: number */
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n); /*!< in: number */
+
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+ulint
+ut_time_ms(void);
+/*============*/
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Determine how many bytes (groups of 8 bits) are needed to
+store the given number of bits.
+@param b in: bits
+@return number of bytes (octets) needed to represent b */
+#define UT_BITS_IN_BYTES(b) (((b) + 7) >> 3)
+
+/** Determines if a number is zero or a power of two.
+@param[in] n number
+@return nonzero if n is zero or a power of two; zero otherwise */
+#define ut_is_2pow(n) (!((n) & ((n) - 1)))
+
+/** Functor that compares two C strings. Can be used as a comparator for
+e.g. std::map that uses char* as keys. */
+struct ut_strcmp_functor
+{
+ bool operator()(
+ const char* a,
+ const char* b) const
+ {
+ return(strcmp(a, b) < 0);
+ }
+};
+
+/**********************************************************//**
+Prints a timestamp to a file. */
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file) /*!< in: file where to print */
+ ATTRIBUTE_COLD __attribute__((nonnull));
+
+#ifndef UNIV_INNOCHECKSUM
+
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf); /*!< in: buffer where to sprintf */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len); /*!< in: length of the buffer */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex. */
+void
+ut_print_buf_hex(
+/*=============*/
+ std::ostream& o, /*!< in/out: output stream */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+ std::ostream& o, /*!< in/out: output stream */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+ MY_ATTRIBUTE((nonnull));
+
+/* Forward declaration of transaction handle */
+struct trx_t;
+
+/** Get a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier.
+ @param [in] trx transaction (NULL=no quotes).
+ @param [in] name table name.
+ @retval String quoted as an SQL identifier.
+*/
+std::string
+ut_get_name(
+ const trx_t* trx,
+ const char* name);
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+void
+ut_print_name(
+/*==========*/
+ FILE* ef, /*!< in: stream */
+ const trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: table name to print */
+/** Format a table name, quoted as an SQL identifier.
+If the name contains a slash '/', the result will contain two
+identifiers separated by a period (.), as in SQL
+database_name.table_name.
+@see table_name_t
+@param[in] name table or index name
+@param[out] formatted formatted result, will be NUL-terminated
+@param[in] formatted_size size of the buffer in bytes
+@return pointer to 'formatted' */
+char*
+ut_format_name(
+ const char* name,
+ char* formatted,
+ ulint formatted_size);
+
+/**********************************************************************//**
+Catenate files. */
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src); /*!< in: input file to be appended to output */
+
+/*************************************************************//**
+Convert an error number to a human readable text message. The
+returned string is static and should not be freed or modified.
+@return string, describing the error */
+const char*
+ut_strerr(
+/*======*/
+ dberr_t num); /*!< in: error number */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+namespace ib {
+
+/** This is a wrapper class, used to print any unsigned integer type
+in hexadecimal format. The main purpose of this data type is to
+overload the global operator<<, so that we can print the given
+wrapper value in hex. */
+struct hex {
+ explicit hex(uintmax_t t): m_val(t) {}
+ const uintmax_t m_val;
+};
+
+/** This is an overload of the global operator<< for the user defined type
+ib::hex. The unsigned value held in the ib::hex wrapper class will be printed
+into the given output stream in hexadecimal format.
+@param[in,out] lhs the output stream into which rhs is written.
+@param[in] rhs the object to be written into lhs.
+@retval reference to the output stream. */
+inline
+std::ostream&
+operator<<(
+ std::ostream& lhs,
+ const hex& rhs)
+{
+ std::ios_base::fmtflags ff = lhs.flags();
+ lhs << std::showbase << std::hex << rhs.m_val;
+ lhs.setf(ff);
+ return(lhs);
+}
+
+/** This is a wrapper class, used to print any number in IEC style */
+struct bytes_iec {
+ explicit bytes_iec(unsigned long long t): m_val(t) {}
+ double get_double() const { return static_cast<double>(m_val); }
+ const unsigned long long m_val;
+};
+
+/** Like hex operator above, except for bytes_iec */
+std::ostream &operator<<(std::ostream &lhs, const bytes_iec &rhs);
+
+/** The class logger is the base class of all the error log related classes.
+It contains a std::ostringstream object. The main purpose of this class is
+to forward operator<< to the underlying std::ostringstream object. Do not
+use this class directly, instead use one of the derived classes. */
+class logger
+{
+protected:
+ /* This class must not be used directly */
+ ATTRIBUTE_COLD ATTRIBUTE_NOINLINE logger() = default;
+public:
+ template<typename T> ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
+ logger& operator<<(const T& rhs)
+ {
+ m_oss << rhs;
+ return *this;
+ }
+
+ /** Handle a fixed character string in the same way as a pointer to
+ an unknown-length character string, to reduce object code bloat. */
+ template<size_t N> logger& operator<<(const char (&rhs)[N])
+ { return *this << static_cast<const char*>(rhs); }
+
+ /** Output an error code name */
+ ATTRIBUTE_COLD logger& operator<<(dberr_t err);
+
+ /** Append a string.
+ @param buf string buffer
+ @param size buffer size
+ @return the output stream */
+ ATTRIBUTE_COLD __attribute__((noinline))
+ std::ostream &write(const char *buf, std::streamsize size)
+ {
+ return m_oss.write(buf, size);
+ }
+
+ std::ostream &write(const byte *buf, std::streamsize size)
+ { return write(reinterpret_cast<const char*>(buf), size); }
+
+ std::ostringstream m_oss;
+};
+
+/** The class info is used to emit informational log messages. It is to be
+used similar to std::cout. But the log messages will be emitted only when
+the dtor is called. The preferred usage of this class is to make use of
+unnamed temporaries as follows:
+
+info() << "The server started successfully.";
+
+In the above usage, the temporary object will be destroyed at the end of the
+statement and hence the log message will be emitted at the end of the
+statement. If a named object is created, then the log message will be emitted
+only when it goes out of scope or destroyed. */
+class info : public logger {
+public:
+ ATTRIBUTE_COLD
+ ~info();
+};
+
+/** The class warn is used to emit warnings. Refer to the documentation of
+class info for further details. */
+class warn : public logger {
+public:
+ ATTRIBUTE_COLD
+ ~warn();
+};
+
+/** The class error is used to emit error messages. Refer to the
+documentation of class info for further details. */
+class error : public logger {
+public:
+ ATTRIBUTE_COLD
+ ~error();
+ /** Indicates that error::~error() was invoked. Can be used to
+ determine if error messages were logged during innodb code execution.
+ @return true if there were error messages, false otherwise. */
+ static bool was_logged() { return logged; }
+
+private:
+ /** true if error::~error() was invoked, false otherwise */
+ static bool logged;
+};
+
+/** The class fatal is used to emit an error message and stop the server
+by crashing it. Use this class when MySQL server needs to be stopped
+immediately. Refer to the documentation of class info for usage details. */
+class fatal : public logger {
+public:
+ ATTRIBUTE_NORETURN
+ ~fatal();
+};
+
+/** Emit an error message if the given predicate is true, otherwise emit a
+warning message */
+class error_or_warn : public logger {
+public:
+ ATTRIBUTE_COLD
+ error_or_warn(bool pred)
+ : m_error(pred)
+ {}
+
+ ATTRIBUTE_COLD
+ ~error_or_warn();
+private:
+ const bool m_error;
+};
+
+/** Emit a fatal message if the given predicate is true, otherwise emit a
+error message. */
+class fatal_or_error : public logger {
+public:
+ ATTRIBUTE_COLD
+ fatal_or_error(bool pred)
+ : m_fatal(pred)
+ {}
+
+ ATTRIBUTE_COLD
+ ~fatal_or_error();
+private:
+ const bool m_fatal;
+};
+
+} // namespace ib
+
+#include "ut0ut.inl"
+
+#endif
+
diff --git a/storage/innobase/include/ut0ut.inl b/storage/innobase/include/ut0ut.inl
new file mode 100644
index 00000000..73feaf82
--- /dev/null
+++ b/storage/innobase/include/ut0ut.inl
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0ut.ic
+Various utilities
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#include <algorithm>
+
+/** Calculate the minimum of two pairs.
+@param[out] min_hi MSB of the minimum pair
+@param[out] min_lo LSB of the minimum pair
+@param[in] a_hi MSB of the first pair
+@param[in] a_lo LSB of the first pair
+@param[in] b_hi MSB of the second pair
+@param[in] b_lo LSB of the second pair */
+UNIV_INLINE
+void
+ut_pair_min(
+ ulint* min_hi,
+ ulint* min_lo,
+ ulint a_hi,
+ ulint a_lo,
+ ulint b_hi,
+ ulint b_lo)
+{
+ if (a_hi == b_hi) {
+ *min_hi = a_hi;
+ *min_lo = std::min(a_lo, b_lo);
+ } else if (a_hi < b_hi) {
+ *min_hi = a_hi;
+ *min_lo = a_lo;
+ } else {
+ *min_hi = b_hi;
+ *min_lo = b_lo;
+ }
+}
+
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b) /*!< in: ulint */
+{
+ if (a < b) {
+ return(-1);
+ } else if (a == b) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/** Compare two pairs of integers.
+@param[in] a_h more significant part of first pair
+@param[in] a_l less significant part of first pair
+@param[in] b_h more significant part of second pair
+@param[in] b_l less significant part of second pair
+@return comparison result of (a_h,a_l) and (b_h,b_l)
+@retval -1 if (a_h,a_l) is less than (b_h,b_l)
+@retval 0 if (a_h,a_l) is equal to (b_h,b_l)
+@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */
+UNIV_INLINE
+int
+ut_pair_cmp(
+ ulint a_h,
+ ulint a_l,
+ ulint b_h,
+ ulint b_l)
+{
+ if (a_h < b_h) {
+ return(-1);
+ }
+ if (a_h > b_h) {
+ return(1);
+ }
+ return(ut_ulint_cmp(a_l, b_l));
+}
+
+/*************************************************************//**
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n) /*!< in: number != 0 */
+{
+ ulint res;
+
+ res = 0;
+
+ ut_ad(n > 0);
+
+ n = n - 1;
+
+ for (;;) {
+ n = n / 2;
+
+ if (n == 0) {
+ break;
+ }
+
+ res++;
+ }
+
+ return(res + 1);
+}
+
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n) /*!< in: number */
+{
+ return((ulint) 1 << n);
+}
diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h
new file mode 100644
index 00000000..f4660f96
--- /dev/null
+++ b/storage/innobase/include/ut0vec.h
@@ -0,0 +1,285 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "mem0mem.h"
+
+struct ib_alloc_t;
+struct ib_vector_t;
+
+typedef void* (*ib_mem_alloc_t)(
+ /* out: Pointer to allocated memory */
+ ib_alloc_t* allocator, /* in: Pointer to allocator instance */
+ ulint size); /* in: Number of bytes to allocate */
+
+typedef void (*ib_mem_free_t)(
+ ib_alloc_t* allocator, /* in: Pointer to allocator instance */
+ void* ptr); /* in: Memory to free */
+
+typedef void* (*ib_mem_resize_t)(
+ /* out: Pointer to resized memory */
+ ib_alloc_t* allocator, /* in: Pointer to allocator */
+ void* ptr, /* in: Memory to resize */
+ ulint old_size, /* in: Old memory size in bytes */
+ ulint new_size); /* in: New size in bytes */
+
+typedef int (*ib_compare_t)(const void*, const void*);
+
+/* An automatically resizing vector datatype with the following properties:
+
+ -All memory allocation is done through an allocator, which is responsible for
+freeing it when done with the vector.
+*/
+
+/* This is useful shorthand for elements of type void* */
+#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n))
+#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n))
+
+#define ib_vector_allocator(v) (v->allocator)
+
+/********************************************************************
+Create a new vector with the given initial size. */
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+ /* out: vector */
+ ib_alloc_t* alloc, /* in: Allocator */
+ /* in: size of the data item */
+ ulint sizeof_value,
+ ulint size); /* in: initial size */
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Push a new element to the vector, increasing its size if necessary,
+if elem is not NULL then elem is copied to the vector.*/
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+ /* out: pointer the "new" element */
+ ib_vector_t* vec, /* in/out: vector */
+ const void* elem); /* in: data element */
+
+/********************************************************************
+Pop the last element from the vector.*/
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ /* out: pointer to the "new" element */
+ ib_vector_t* vec); /* in/out: vector */
+
+/*******************************************************************//**
+Remove an element to the vector
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+ ib_vector_t* vec, /*!< in: vector */
+ const void* elem); /*!< in: value to remove */
+
+/********************************************************************
+Get the number of elements in the vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ /* out: number of elements in vector */
+ const ib_vector_t* vec); /* in: vector */
+
+/********************************************************************
+Increase the size of the vector. */
+void
+ib_vector_resize(
+/*=============*/
+ /* out: number of elements in vector */
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n); /*!< in: element index to get */
+
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+ const ib_vector_t* vec, /* in: vector */
+ ulint n); /* in: element index to get */
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+ ib_vector_t* vec); /*!< in: vector */
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+ ib_vector_t* vec, /*!< in/out: vector */
+ ulint n, /*!< in: element index to set */
+ void* elem); /*!< in: data element */
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+ /* out: pointer to last element */
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+ /* out: pointer to last element */
+ const ib_vector_t* vec); /* in: vector */
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+ ib_vector_t* vec, /* in/out: vector */
+ ib_compare_t compare); /* in: the comparator to use for sort */
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+ ib_alloc_t* allocator, /* in: allocator */
+ void* ptr); /* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+ /* out: pointer to allocated memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ ulint size); /* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+ /* out: pointer to reallocated
+ memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size, /* in: old size in bytes */
+ ulint new_size); /* in: new size in bytes */
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+ /* out: heap allocator instance */
+ mem_heap_t* heap); /* in: heap to use */
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+ ib_alloc_t* ib_ut_alloc); /* in: alloc instace to free */
+
+/* Allocator used by ib_vector_t. */
+struct ib_alloc_t {
+ ib_mem_alloc_t mem_malloc; /* For allocating memory */
+ ib_mem_free_t mem_release; /* For freeing memory */
+ ib_mem_resize_t mem_resize; /* For resizing memory */
+ void* arg; /* Currently if not NULL then it
+ points to the heap instance */
+};
+
+/* See comment at beginning of file. */
+struct ib_vector_t {
+ ib_alloc_t* allocator; /* Allocator, because one size
+ doesn't fit all */
+ void* data; /* data elements */
+ ulint used; /* number of elements currently used */
+ ulint total; /* number of elements allocated */
+ /* Size of a data item */
+ ulint sizeof_value;
+};
+
+#include "ut0vec.inl"
+
+#endif /* IB_VECTOR_H */
diff --git a/storage/innobase/include/ut0vec.inl b/storage/innobase/include/ut0vec.inl
new file mode 100644
index 00000000..531f0f22
--- /dev/null
+++ b/storage/innobase/include/ut0vec.inl
@@ -0,0 +1,348 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.ic
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#define IB_VEC_OFFSET(v, i) (vec->sizeof_value * i)
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+ ib_alloc_t* allocator, /* in: allocator */
+ ulint size) /* in: size in bytes */
+{
+ mem_heap_t* heap = (mem_heap_t*) allocator->arg;
+
+ return(mem_heap_alloc(heap, size));
+}
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+ ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */
+ void* ptr UNIV_UNUSED) /* in: size in bytes */
+{
+ /* We can't free individual elements. */
+}
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+We always assume new_size >= old_size, so the buffer won't overflow.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+ ib_alloc_t* allocator, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size, /* in: old size in bytes */
+ ulint new_size) /* in: new size in bytes */
+{
+ void* new_ptr;
+ mem_heap_t* heap = (mem_heap_t*) allocator->arg;
+
+ ut_a(new_size >= old_size);
+ new_ptr = mem_heap_alloc(heap, new_size);
+ memcpy(new_ptr, old_ptr, old_size);
+
+ return(new_ptr);
+}
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+ mem_heap_t* heap) /* in: heap to use */
+{
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc));
+
+ heap_alloc->arg = heap;
+ heap_alloc->mem_release = ib_heap_free;
+ heap_alloc->mem_malloc = ib_heap_malloc;
+ heap_alloc->mem_resize = ib_heap_resize;
+
+ return(heap_alloc);
+}
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+ ib_alloc_t* ib_ut_alloc) /* in: alloc instace to free */
+{
+ mem_heap_free((mem_heap_t*) ib_ut_alloc->arg);
+}
+
+/********************************************************************
+Get number of elements in vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ /* out: number of elements in vector*/
+ const ib_vector_t* vec) /* in: vector */
+{
+ return(vec->used);
+}
+
+/****************************************************************//**
+Get n'th element. */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n) /*!< in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+ const ib_vector_t* vec, /* in: vector */
+ ulint n) /* in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+ ib_vector_t* vec) /*!< in: vector */
+{
+ ut_a(vec->used > 0);
+
+ return((byte*) ib_vector_get(vec, vec->used - 1));
+}
+
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+ ib_vector_t* vec, /*!< in/out: vector */
+ ulint n, /*!< in: element index to set */
+ void* elem) /*!< in: data element */
+{
+ void* slot;
+
+ ut_a(n < vec->used);
+
+ slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+ memcpy(slot, elem, vec->sizeof_value);
+}
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+ /* out: void */
+ ib_vector_t* vec) /* in: vector */
+{
+ vec->used = 0;
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+ /* out: void */
+ ib_vector_t* vec) /* in: vector */
+{
+ ut_a(ib_vector_size(vec) > 0);
+
+ return(ib_vector_get(vec, ib_vector_size(vec) - 1));
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+ /* out: void */
+ const ib_vector_t* vec) /* in: vector */
+{
+ ut_a(ib_vector_size(vec) > 0);
+
+ return(ib_vector_get_const(vec, ib_vector_size(vec) - 1));
+}
+
+/****************************************************************//**
+Remove the last element from the vector.
+@return last vector element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ /* out: pointer to element */
+ ib_vector_t* vec) /* in: vector */
+{
+ void* elem;
+
+ ut_a(vec->used > 0);
+
+ elem = ib_vector_last(vec);
+ --vec->used;
+
+ return(elem);
+}
+
+/********************************************************************
+Append an element to the vector, if elem != NULL then copy the data
+from elem.*/
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+ /* out: pointer to the "new" element */
+ ib_vector_t* vec, /* in: vector */
+ const void* elem) /* in: element to add (can be NULL) */
+{
+ void* last;
+
+ if (vec->used >= vec->total) {
+ ib_vector_resize(vec);
+ }
+
+ last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used);
+
+#ifdef UNIV_DEBUG
+ memset(last, 0, vec->sizeof_value);
+#endif
+
+ if (elem) {
+ memcpy(last, elem, vec->sizeof_value);
+ }
+
+ ++vec->used;
+
+ return(last);
+}
+
+/*******************************************************************//**
+Remove an element to the vector
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+ ib_vector_t* vec, /*!< in: vector */
+ const void* elem) /*!< in: value to remove */
+{
+ void* current = NULL;
+ void* next;
+ ulint i;
+ ulint old_used_count = vec->used;
+
+ for (i = 0; i < vec->used; i++) {
+ current = ib_vector_get(vec, i);
+
+ if (*(void**) current == elem) {
+ if (i == vec->used - 1) {
+ return(ib_vector_pop(vec));
+ }
+
+ next = ib_vector_get(vec, i + 1);
+ memmove(current, next, vec->sizeof_value
+ * (vec->used - i - 1));
+ --vec->used;
+ break;
+ }
+ }
+
+ return((old_used_count != vec->used) ? current : NULL);
+}
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+ /* out: void */
+ ib_vector_t* vec, /* in: vector */
+ ib_compare_t compare)/* in: the comparator to use for sort */
+{
+ qsort(vec->data, vec->used, vec->sizeof_value, compare);
+}
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec) /* in, own: vector */
+{
+ /* Currently we only support one type of allocator - heap,
+ when the heap is freed all the elements are freed too. */
+
+ /* Only the heap allocator uses the arg field. */
+ ut_ad(vec->allocator->arg != NULL);
+
+ mem_heap_free((mem_heap_t*) vec->allocator->arg);
+}
+
+/********************************************************************
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec) /*!< in: vector */
+{
+ return(ib_vector_size(vec) == 0);
+}
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
new file mode 100644
index 00000000..95c7a248
--- /dev/null
+++ b/storage/innobase/include/ut0wqueue.h
@@ -0,0 +1,86 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0wqueue.h
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A Work queue. Threads can add work items to the queue and other threads can
+wait for work items to be available and take them off the queue for
+processing.
+************************************************************************/
+
+#pragma once
+
+#include "ut0list.h"
+#include "mem0mem.h"
+
+// Forward declaration
+struct ib_list_t;
+
+/** Work queue */
+struct ib_wqueue_t
+{
+ /** Mutex protecting everything */
+ mysql_mutex_t mutex;
+ /** Work item list */
+ ib_list_t *items;
+ /** ib_list_len(*items) */
+ size_t length;
+};
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+ib_wqueue_t*
+ib_wqueue_create();
+/*===============*/
+
+/****************************************************************//**
+Free a work queue. */
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/** Add a work item to the queue.
+@param[in,out] wq work queue
+@param[in] item work item
+@param[in,out] heap memory heap to use for allocating list node
+@param[in] wq_locked work queue mutex locked */
+void
+ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap,
+ bool wq_locked = false);
+
+/** Check if queue is empty.
+@param wq wait queue
+@return whether the queue is empty */
+bool ib_wqueue_is_empty(ib_wqueue_t* wq);
+
+/********************************************************************
+Return first item on work queue or NULL if queue is empty
+@return work item or NULL */
+void*
+ib_wqueue_nowait(
+/*=============*/
+ ib_wqueue_t* wq); /*<! in: work queue */